23 января 2009 г. 0:58 пользователь Peter Eisentraut <pete...@gmx.net> написал:
> On Thursday 22 January 2009 15:39:00 Sergey Burladyan wrote:
>> seb=# select xpath('/русский/text()', v::xml) from (select
>> xml('<русский>язык</русский>')) as x(v);
>> ERROR:  could not parse XML data
>> DETAIL:  Entity: line 1: parser error : Input is not proper UTF-8, indicate
>> encoding !
>> Bytes: 0xF0 0xF3 0xF1 0xF1
>> <x><русский>язык</русский></x>
>>     ^

> This raises the question: What are the rules about encoding the characters in
> XPath expressions themselves?  I haven't found anything about that in the
> standard.  Anyone know?

PostgreSQL does not use libxml2's internal encoding support and strips
the XML encoding from the XML body, so I think there is no choice: by default,
for libxml2 it must be in its internal encoding, UTF-8, anyway.

I am not sure about the XML standard, but maybe the libxml2 documentation
can help to solve this issue? See http://xmlsoft.org/encoding.html

"What does this mean in practice for the libxml2 user:
* xmlChar, the libxml2 data type is a byte, those bytes must be
assembled as UTF-8 valid strings. The proper way to terminate an
xmlChar * string is simply to append 0 byte, as usual.
* One just need to make sure that when using chars outside the ASCII
set, the values has been properly converted to UTF-8"

I understand this as: all xmlChar strings must be in UTF-8 encoding,
no matter what the encoding of the XML body is.

I tried to fix this issue for the xpath function; see the patch in the attachment.

By the way, contrib/xml2 also has this issue...
*** a/src/backend/utils/adt/xml.c
--- b/src/backend/utils/adt/xml.c
***************
*** 374,383 **** cstring_to_xmltype(const char *string)
  
  #ifdef USE_LIBXML
  static xmltype *
! xmlBuffer_to_xmltype(xmlBufferPtr buf)
  {
! 	return (xmltype *) cstring_to_text_with_len((char *) xmlBufferContent(buf),
! 												xmlBufferLength(buf));
  }
  #endif
  
--- 374,402 ----
  
  #ifdef USE_LIBXML
  static xmltype *
! xmlBuffer_to_xmltype(xmlBufferPtr buf, int need_encode)
  {
! 	const xmlChar *utf8str = xmlBufferContent(buf);
! 	int len = xmlBufferLength(buf);
! 	char *str = NULL;
! 	xmltype *res;
! 
! 	if (need_encode)
! 	{
! 		/* libxml2 internal encoding (utf8) to database encoding */
! 		str = (char *) pg_do_encoding_conversion((unsigned char *) utf8str,
! 												 len,
! 												 PG_UTF8,
! 												 GetDatabaseEncoding());
! 		utf8str = (xmlChar *) str;
! 	}
! 
! 	res = (xmltype *) cstring_to_text_with_len((const char *) utf8str, len);
! 
! 	if (str && (char *) utf8str != str)
! 		pfree(str);
! 
! 	return res;
  }
  #endif
  
***************
*** 627,633 **** xmlelement(XmlExprState *xmlExpr, ExprContext *econtext)
  	xmlTextWriterEndElement(writer);
  	xmlFreeTextWriter(writer);
  
! 	result = xmlBuffer_to_xmltype(buf);
  	xmlBufferFree(buf);
  
  	return result;
--- 646,652 ----
  	xmlTextWriterEndElement(writer);
  	xmlFreeTextWriter(writer);
  
! 	result = xmlBuffer_to_xmltype(buf, 0);
  	xmlBufferFree(buf);
  
  	return result;
***************
*** 3152,3158 **** SPI_sql_row_to_xmlelement(int rownum, StringInfo result, char *tablename,
  static text *
  xml_xmlnodetoxmltype(xmlNodePtr cur)
  {
! 	xmlChar    *str;
  	xmltype    *result;
  	xmlBufferPtr buf;
  
--- 3171,3178 ----
  static text *
  xml_xmlnodetoxmltype(xmlNodePtr cur)
  {
! 	xmlChar    *utf8str;
! 	char       *str;
  	xmltype    *result;
  	xmlBufferPtr buf;
  
***************
*** 3160,3173 **** xml_xmlnodetoxmltype(xmlNodePtr cur)
  	{
  		buf = xmlBufferCreate();
  		xmlNodeDump(buf, NULL, cur, 0, 1);
! 		result = xmlBuffer_to_xmltype(buf);
  		xmlBufferFree(buf);
  	}
  	else
  	{
! 		str = xmlXPathCastNodeToString(cur);
! 		result = (xmltype *) cstring_to_text((char *) str);
! 		xmlFree(str);
  	}
  
  	return result;
--- 3180,3200 ----
  	{
  		buf = xmlBufferCreate();
  		xmlNodeDump(buf, NULL, cur, 0, 1);
! 		result = xmlBuffer_to_xmltype(buf, 1);
  		xmlBufferFree(buf);
  	}
  	else
  	{
! 		utf8str = xmlXPathCastNodeToString(cur);
! 		/* XPath result libxml2 internal encoding (utf8) to database encoding */
! 		str = (char *) pg_do_encoding_conversion(utf8str,
! 												 strlen((const char *) utf8str),
! 												 PG_UTF8,
! 												 GetDatabaseEncoding());
! 		result = (xmltype *) cstring_to_text(str);
! 		if ((xmlChar *) str != utf8str)
! 			pfree(str);
! 		xmlFree(utf8str);
  	}
  
  	return result;
***************
*** 3200,3208 **** xpath(PG_FUNCTION_ARGS)
--- 3227,3238 ----
  	xmlXPathObjectPtr xpathobj;
  	char	   *datastr;
  	int32		len;
+ 	int32		utf8string_len;
  	int32		xpath_len;
  	xmlChar    *string;
+ 	xmlChar    *utf8string;
  	xmlChar    *xpath_expr;
+ 	xmlChar    *utf8xpath_expr;
  	int			i;
  	int			res_nitems;
  	int			ndim;
***************
*** 3294,3299 **** xpath(PG_FUNCTION_ARGS)
--- 3324,3345 ----
  	xpath_expr[xpath_len + 2] = '\0';
  	xpath_len += 2;
  
+ 	/* xml body database encoding to libxml2 internal encoding (utf8) */
+ 	utf8string = pg_do_encoding_conversion(string,
+ 										   len,
+ 										   GetDatabaseEncoding(),
+ 										   PG_UTF8);
+ 	if (utf8string != string)
+ 		utf8string_len = strlen((char *) utf8string);
+ 	else
+ 		utf8string_len = len;
+ 
+ 	/* XPath expression database encoding to libxml2 internal encoding (utf8) */
+ 	utf8xpath_expr = pg_do_encoding_conversion(xpath_expr,
+ 											   xpath_len,
+ 											   GetDatabaseEncoding(),
+ 											   PG_UTF8);
+ 
  	xmlInitParser();
  
  	/*
***************
*** 3304,3310 **** xpath(PG_FUNCTION_ARGS)
  	if (ctxt == NULL)
  		xml_ereport(ERROR, ERRCODE_OUT_OF_MEMORY,
  					"could not allocate parser context");
! 	doc = xmlCtxtReadMemory(ctxt, (char *) string, len, NULL, NULL, 0);
  	if (doc == NULL)
  		xml_ereport(ERROR, ERRCODE_INVALID_XML_DOCUMENT,
  					"could not parse XML data");
--- 3350,3356 ----
  	if (ctxt == NULL)
  		xml_ereport(ERROR, ERRCODE_OUT_OF_MEMORY,
  					"could not allocate parser context");
! 	doc = xmlCtxtReadMemory(ctxt, (char *) utf8string, utf8string_len, NULL, NULL, 0);
  	if (doc == NULL)
  		xml_ereport(ERROR, ERRCODE_INVALID_XML_DOCUMENT,
  					"could not parse XML data");
***************
*** 3341,3347 **** xpath(PG_FUNCTION_ARGS)
  		}
  	}
  
! 	xpathcomp = xmlXPathCompile(xpath_expr);
  	if (xpathcomp == NULL)	/* TODO: show proper XPath error details */
  		xml_ereport(ERROR, ERRCODE_INTERNAL_ERROR,
  					"invalid XPath expression");
--- 3387,3393 ----
  		}
  	}
  
! 	xpathcomp = xmlXPathCompile(utf8xpath_expr);
  	if (xpathcomp == NULL)	/* TODO: show proper XPath error details */
  		xml_ereport(ERROR, ERRCODE_INTERNAL_ERROR,
  					"invalid XPath expression");
***************
*** 3366,3371 **** xpath(PG_FUNCTION_ARGS)
--- 3412,3421 ----
  			Datum		elem;
  			bool		elemisnull = false;
  
+ 			/*
+ 			 * XPath result libxml2 internal encoding (utf8) to database encoding
+ 			 * converted in xml_xmlnodetoxmltype
+ 			 */
  			elem = PointerGetDatum(xml_xmlnodetoxmltype(xpathobj->nodesetval->nodeTab[i]));
  			astate = accumArrayResult(astate, elem,
  									  elemisnull, XMLOID,
-- 
Sent via pgsql-bugs mailing list (pgsql-bugs@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-bugs

Reply via email to