Hi Tom

On 06.09.24 18:34, Tom Lane wrote:
> I think it'd be quite foolish to assume that every extant and future
> version of libxml2 will share this glitch.  Probably should use
> logic more like pg_strip_crlf(), although we can't use that directly.
Makes sense. I introduced this logic at the end of
xmltotext_with_options() for the case where it is called with INDENT and
a DOCUMENT-type xml string.

SELECT xmlserialize(DOCUMENT '<foo><bar>42</bar></foo>' AS text INDENT);
  xmlserialize   
-----------------
 <foo>          +
   <bar>42</bar>+
 </foo>
(1 row)

The regression tests were updated accordingly - see patch v2-0002.
> Would it ever be the case that trailing whitespace would be valid
> data?  In a bit of testing, it seems like that could be true in
> CONTENT mode but not DOCUMENT mode.
Yes, in case of CONTENT it is valid data and it will be preserved, as
CONTENT can be pretty much anything.

SELECT xmlserialize(CONTENT E'<foo><bar>42</bar></foo>\n\n\t\t\t' AS
text INDENT);
       xmlserialize       
--------------------------
 <foo>                   +
   <bar>42</bar>         +
 </foo>                  +
                         +
                         
(1 row)


With DOCUMENT, trailing whitespace is superfluous and should be removed
after indentation. IIRC there's an xmlSaveToBuffer option called
XML_SAVE_WSNONSIG that can be used to preserve it.

Thanks

Best, Jim
From ba0aa7c2a822bd23a9b2ec5af07265bc3eba86ce Mon Sep 17 00:00:00 2001
From: Jim Jones <jim.jo...@uni-muenster.de>
Date: Fri, 6 Sep 2024 23:48:24 +0200
Subject: [PATCH v2 2/2] Bug fix: remove default trailing newline from
 XMLSERIALIZE calls

xmlDocContentDumpOutput adds a trailing newline by default to
DOCUMENT-typed XML strings when XMLSERIALIZE is called with the
INDENT option. This introduces a condition that checks whether the
serialized DOCUMENT contains a trailing newline and, if so, removes
it. Regression tests were updated accordingly.
---
 src/backend/utils/adt/xml.c         | 18 +++++++++++++++++-
 src/test/regress/expected/xml.out   | 18 ++++++------------
 src/test/regress/expected/xml_2.out | 18 ++++++------------
 3 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index 1cd4929870..fa38d2b992 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -808,7 +808,23 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent)
 						"could not close xmlSaveCtxtPtr");
 		}
 
-		result = (text *) xmlBuffer_to_xmltype(buf);
+		/*
+		 * xmlDocContentDumpOutput adds a trailing newline by default
+		 * so we get rid of it here.
+		 */
+		if (xmloption_arg == XMLOPTION_DOCUMENT)
+		{
+			char *str = (char *) xmlBufferContent(buf);
+			int len = xmlBufferLength(buf);
+
+			while (len > 0 && (str[len - 1] == '\n' ||
+								str[len - 1] == '\r'))
+				str[--len] = '\0';
+
+			result = cstring_to_text_with_len(str, len);
+		}
+		else
+			result = (text *) xmlBuffer_to_xmltype(buf);
 	}
 	PG_CATCH();
 	{
diff --git a/src/test/regress/expected/xml.out b/src/test/regress/expected/xml.out
index 6f073101a1..361a6f9b27 100644
--- a/src/test/regress/expected/xml.out
+++ b/src/test/regress/expected/xml.out
@@ -485,8 +485,7 @@ SELECT xmlserialize(DOCUMENT '<foo><bar><val x="y">42</val></bar></foo>' AS text
    <bar>                +
      <val x="y">42</val>+
    </bar>               +
- </foo>                 +
- 
+ </foo>
 (1 row)
 
 SELECT xmlserialize(CONTENT  '<foo><bar><val x="y">42</val></bar></foo>' AS text INDENT);
@@ -546,8 +545,7 @@ SELECT xmlserialize(DOCUMENT '<foo><bar><val x="y">42</val><val x="y">text node<
      <val x="y">42</val>                    +
      <val x="y">text node<val>73</val></val>+
    </bar>                                   +
- </foo>                                     +
- 
+ </foo>
 (1 row)
 
 SELECT xmlserialize(CONTENT  '<foo><bar><val x="y">42</val><val x="y">text node<val>73</val></val></bar></foo>' AS text INDENT);
@@ -601,8 +599,7 @@ SELECT xmlserialize(DOCUMENT '<?xml version="1.0" encoding="UTF-8"?><foo><bar><v
    <bar>                               +
      <val>73</val>                     +
    </bar>                              +
- </foo>                                +
- 
+ </foo>
 (1 row)
 
 SELECT xmlserialize(CONTENT  '<?xml version="1.0" encoding="UTF-8"?><foo><bar><val>73</val></bar></foo>' AS text INDENT);
@@ -620,8 +617,7 @@ SELECT xmlserialize(DOCUMENT '<!DOCTYPE a><a/>' AS text INDENT);
  xmlserialize 
 --------------
  <!DOCTYPE a>+
- <a/>        +
- 
+ <a/>
 (1 row)
 
 SELECT xmlserialize(CONTENT  '<!DOCTYPE a><a/>' AS text INDENT);
@@ -638,8 +634,7 @@ SELECT xmlserialize(DOCUMENT '<foo><bar></bar></foo>' AS text INDENT);
 --------------
  <foo>       +
    <bar/>    +
- </foo>      +
- 
+ </foo>
 (1 row)
 
 SELECT xmlserialize(CONTENT  '<foo><bar></bar></foo>' AS text INDENT);
@@ -669,8 +664,7 @@ SELECT xmlserialize(DOCUMENT '<foo>   <bar></bar>    </foo>' AS text INDENT);
 --------------
  <foo>       +
    <bar/>    +
- </foo>      +
- 
+ </foo>
 (1 row)
 
 SELECT xmlserialize(CONTENT  'text node<foo>    <bar></bar>   </foo>' AS text INDENT);
diff --git a/src/test/regress/expected/xml_2.out b/src/test/regress/expected/xml_2.out
index 7b154da4ba..73c2851d3f 100644
--- a/src/test/regress/expected/xml_2.out
+++ b/src/test/regress/expected/xml_2.out
@@ -471,8 +471,7 @@ SELECT xmlserialize(DOCUMENT '<foo><bar><val x="y">42</val></bar></foo>' AS text
    <bar>                +
      <val x="y">42</val>+
    </bar>               +
- </foo>                 +
- 
+ </foo>
 (1 row)
 
 SELECT xmlserialize(CONTENT  '<foo><bar><val x="y">42</val></bar></foo>' AS text INDENT);
@@ -532,8 +531,7 @@ SELECT xmlserialize(DOCUMENT '<foo><bar><val x="y">42</val><val x="y">text node<
      <val x="y">42</val>                    +
      <val x="y">text node<val>73</val></val>+
    </bar>                                   +
- </foo>                                     +
- 
+ </foo>
 (1 row)
 
 SELECT xmlserialize(CONTENT  '<foo><bar><val x="y">42</val><val x="y">text node<val>73</val></val></bar></foo>' AS text INDENT);
@@ -587,8 +585,7 @@ SELECT xmlserialize(DOCUMENT '<?xml version="1.0" encoding="UTF-8"?><foo><bar><v
    <bar>                               +
      <val>73</val>                     +
    </bar>                              +
- </foo>                                +
- 
+ </foo>
 (1 row)
 
 SELECT xmlserialize(CONTENT  '<?xml version="1.0" encoding="UTF-8"?><foo><bar><val>73</val></bar></foo>' AS text INDENT);
@@ -606,8 +603,7 @@ SELECT xmlserialize(DOCUMENT '<!DOCTYPE a><a/>' AS text INDENT);
  xmlserialize 
 --------------
  <!DOCTYPE a>+
- <a/>        +
- 
+ <a/>
 (1 row)
 
 SELECT xmlserialize(CONTENT  '<!DOCTYPE a><a/>' AS text INDENT);
@@ -624,8 +620,7 @@ SELECT xmlserialize(DOCUMENT '<foo><bar></bar></foo>' AS text INDENT);
 --------------
  <foo>       +
    <bar/>    +
- </foo>      +
- 
+ </foo>
 (1 row)
 
 SELECT xmlserialize(CONTENT  '<foo><bar></bar></foo>' AS text INDENT);
@@ -655,8 +650,7 @@ SELECT xmlserialize(DOCUMENT '<foo>   <bar></bar>    </foo>' AS text INDENT);
 --------------
  <foo>       +
    <bar/>    +
- </foo>      +
- 
+ </foo>
 (1 row)
 
 SELECT xmlserialize(CONTENT  'text node<foo>    <bar></bar>   </foo>' AS text INDENT);
-- 
2.34.1

From e474cdc457a91e96432f5a14c69b4ecbc0345579 Mon Sep 17 00:00:00 2001
From: Jim Jones <jim.jo...@uni-muenster.de>
Date: Thu, 29 Aug 2024 19:41:02 +0200
Subject: [PATCH v2 1/2] Bug fix for XMLSERIALIZE(...INDENT) for xml containing
 blank nodes

This fixes a bug that caused XMLSERIALIZE(... INDENT) to ignore
blank nodes. It sets xml_parse's 'preserve_whitespace' parameter
to false if the INDENT flag was used in XMLSERIALIZE. New
regression tests are also included.
---
 src/backend/utils/adt/xml.c         | 10 ++++++++--
 src/test/regress/expected/xml.out   | 19 +++++++++++++++++++
 src/test/regress/expected/xml_1.out | 11 +++++++++++
 src/test/regress/expected/xml_2.out | 19 +++++++++++++++++++
 src/test/regress/sql/xml.sql        |  3 +++
 5 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index 447e72b21e..1cd4929870 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -677,8 +677,14 @@ xmltotext_with_options(xmltype *data, XmlOptionType xmloption_arg, bool indent)
 	}
 
 #ifdef USE_LIBXML
-	/* Parse the input according to the xmloption */
-	doc = xml_parse(data, xmloption_arg, true, GetDatabaseEncoding(),
+	/*
+	 * Parse the input according to the xmloption.
+	 * preserve_whitespace is set to false when the function should
+	 * return indented XML; otherwise libxml2 will ignore the elements
+	 * that have whitespace between them.
+	 */
+	doc = xml_parse(data, xmloption_arg, !indent ? true : false,
+					GetDatabaseEncoding(),
 					&parsed_xmloptiontype, &content_nodes,
 					(Node *) &escontext);
 	if (doc == NULL || escontext.error_occurred)
diff --git a/src/test/regress/expected/xml.out b/src/test/regress/expected/xml.out
index 93a79cda8f..6f073101a1 100644
--- a/src/test/regress/expected/xml.out
+++ b/src/test/regress/expected/xml.out
@@ -663,6 +663,25 @@ SELECT xmlserialize(CONTENT  '<foo><bar><val x="y">42</val></bar></foo>' AS text
  t
 (1 row)
 
+-- indent xml strings containing blank nodes
+SELECT xmlserialize(DOCUMENT '<foo>   <bar></bar>    </foo>' AS text INDENT);
+ xmlserialize 
+--------------
+ <foo>       +
+   <bar/>    +
+ </foo>      +
+ 
+(1 row)
+
+SELECT xmlserialize(CONTENT  'text node<foo>    <bar></bar>   </foo>' AS text INDENT);
+ xmlserialize 
+--------------
+ text node   +
+ <foo>       +
+   <bar/>    +
+ </foo>
+(1 row)
+
 SELECT xml '<foo>bar</foo>' IS DOCUMENT;
  ?column? 
 ----------
diff --git a/src/test/regress/expected/xml_1.out b/src/test/regress/expected/xml_1.out
index 9323b84ae2..d26e10441e 100644
--- a/src/test/regress/expected/xml_1.out
+++ b/src/test/regress/expected/xml_1.out
@@ -443,6 +443,17 @@ ERROR:  unsupported XML feature
 LINE 1: SELECT xmlserialize(CONTENT  '<foo><bar><val x="y">42</val><...
                                      ^
 DETAIL:  This functionality requires the server to be built with libxml support.
+-- indent xml strings containing blank nodes
+SELECT xmlserialize(DOCUMENT '<foo>   <bar></bar>    </foo>' AS text INDENT);
+ERROR:  unsupported XML feature
+LINE 1: SELECT xmlserialize(DOCUMENT '<foo>   <bar></bar>    </foo>'...
+                                     ^
+DETAIL:  This functionality requires the server to be built with libxml support.
+SELECT xmlserialize(CONTENT  'text node<foo>    <bar></bar>   </foo>' AS text INDENT);
+ERROR:  unsupported XML feature
+LINE 1: SELECT xmlserialize(CONTENT  'text node<foo>    <bar></bar> ...
+                                     ^
+DETAIL:  This functionality requires the server to be built with libxml support.
 SELECT xml '<foo>bar</foo>' IS DOCUMENT;
 ERROR:  unsupported XML feature
 LINE 1: SELECT xml '<foo>bar</foo>' IS DOCUMENT;
diff --git a/src/test/regress/expected/xml_2.out b/src/test/regress/expected/xml_2.out
index f956322c69..7b154da4ba 100644
--- a/src/test/regress/expected/xml_2.out
+++ b/src/test/regress/expected/xml_2.out
@@ -649,6 +649,25 @@ SELECT xmlserialize(CONTENT  '<foo><bar><val x="y">42</val></bar></foo>' AS text
  t
 (1 row)
 
+-- indent xml strings containing blank nodes
+SELECT xmlserialize(DOCUMENT '<foo>   <bar></bar>    </foo>' AS text INDENT);
+ xmlserialize 
+--------------
+ <foo>       +
+   <bar/>    +
+ </foo>      +
+ 
+(1 row)
+
+SELECT xmlserialize(CONTENT  'text node<foo>    <bar></bar>   </foo>' AS text INDENT);
+ xmlserialize 
+--------------
+ text node   +
+ <foo>       +
+   <bar/>    +
+ </foo>
+(1 row)
+
 SELECT xml '<foo>bar</foo>' IS DOCUMENT;
  ?column? 
 ----------
diff --git a/src/test/regress/sql/xml.sql b/src/test/regress/sql/xml.sql
index 953bac09e4..f752ecb142 100644
--- a/src/test/regress/sql/xml.sql
+++ b/src/test/regress/sql/xml.sql
@@ -168,6 +168,9 @@ SELECT xmlserialize(CONTENT  '<foo><bar></bar></foo>' AS text INDENT);
 -- 'no indent' = not using 'no indent'
 SELECT xmlserialize(DOCUMENT '<foo><bar><val x="y">42</val></bar></foo>' AS text) = xmlserialize(DOCUMENT '<foo><bar><val x="y">42</val></bar></foo>' AS text NO INDENT);
 SELECT xmlserialize(CONTENT  '<foo><bar><val x="y">42</val></bar></foo>' AS text) = xmlserialize(CONTENT '<foo><bar><val x="y">42</val></bar></foo>' AS text NO INDENT);
+-- indent xml strings containing blank nodes
+SELECT xmlserialize(DOCUMENT '<foo>   <bar></bar>    </foo>' AS text INDENT);
+SELECT xmlserialize(CONTENT  'text node<foo>    <bar></bar>   </foo>' AS text INDENT);
 
 SELECT xml '<foo>bar</foo>' IS DOCUMENT;
 SELECT xml '<foo>bar</foo><bar>foo</bar>' IS DOCUMENT;
-- 
2.34.1

Reply via email to