23 января 2009 г. 0:58 пользователь Peter Eisentraut написал:
> On Thursday 22 January 2009 15:39:00 Sergey Burladyan wrote:
>> seb=# select xpath('/русский/text()', v::xml) from (select
>> xml('<русский>язык')) as x(v);
>> ERROR: could not parse XML data
>> DETAIL: Entity: line 1: parser error : Input is not proper UTF-8, indicate
>> encoding !
>> Bytes: 0xF0 0xF3 0xF1 0xF1
>> <русский>язык
>> ^
> This raises the question: What are the rules about encoding the characters in
> XPath expressions themselves? I haven't found anything about that in the
> standard. Anyone know?
PostgreSQL does not use libxml2's internal encoding support and strips
the XML encoding from the XML body, so I think there is no choice: by default,
for libxml2 it must be in its internal encoding, UTF-8, anyway.
I am not sure about the XML standard, but maybe the libxml2 documentation
can help to solve this issue? See http://xmlsoft.org/encoding.html
"What does this mean in practice for the libxml2 user:
* xmlChar, the libxml2 data type is a byte, those bytes must be
assembled as UTF-8 valid strings. The proper way to terminate an
xmlChar * string is simply to append 0 byte, as usual.
* One just need to make sure that when using chars outside the ASCII
set, the values has been properly converted to UTF-8"
I understand this as: all xmlChar strings must be in UTF-8 encoding,
no matter what the encoding of the XML body is.
I tried to fix this issue for the xpath function; see the attached patch.
By the way, contrib/xml2 also has this issue...
*** a/src/backend/utils/adt/xml.c
--- b/src/backend/utils/adt/xml.c
***
*** 374,383 cstring_to_xmltype(const char *string)
#ifdef USE_LIBXML
static xmltype *
! xmlBuffer_to_xmltype(xmlBufferPtr buf)
{
! return (xmltype *) cstring_to_text_with_len((char *) xmlBufferContent(buf),
! xmlBufferLength(buf));
}
#endif
--- 374,402
#ifdef USE_LIBXML
static xmltype *
! xmlBuffer_to_xmltype(xmlBufferPtr buf, int need_encode)
{
! const xmlChar *utf8str = xmlBufferContent(buf);
! int len = xmlBufferLength(buf);
! char *str = NULL;
! xmltype *res;
!
! if (need_encode)
! {
! /* libxml2 internal encoding (utf8) to database encoding */
! str = (char *) pg_do_encoding_conversion((unsigned char *) utf8str,
! len,
! PG_UTF8,
! GetDatabaseEncoding());
! utf8str = (xmlChar *) str;
! }
!
! res = (xmltype *) cstring_to_text_with_len((const char *) utf8str, len);
!
! if (str && (char *) utf8str != str)
! pfree(str);
!
! return res;
}
#endif
***
*** 627,633 xmlelement(XmlExprState *xmlExpr, ExprContext *econtext)
xmlTextWriterEndElement(writer);
xmlFreeTextWriter(writer);
! result = xmlBuffer_to_xmltype(buf);
xmlBufferFree(buf);
return result;
--- 646,652
xmlTextWriterEndElement(writer);
xmlFreeTextWriter(writer);
! result = xmlBuffer_to_xmltype(buf, 0);
xmlBufferFree(buf);
return result;
***
*** 3152,3158 SPI_sql_row_to_xmlelement(int rownum, StringInfo result, char *tablename,
static text *
xml_xmlnodetoxmltype(xmlNodePtr cur)
{
! xmlChar*str;
xmltype*result;
xmlBufferPtr buf;
--- 3171,3178
static text *
xml_xmlnodetoxmltype(xmlNodePtr cur)
{
! xmlChar*utf8str;
! char *str;
xmltype*result;
xmlBufferPtr buf;
***
*** 3160,3173 xml_xmlnodetoxmltype(xmlNodePtr cur)
{
buf = xmlBufferCreate();
xmlNodeDump(buf, NULL, cur, 0, 1);
! result = xmlBuffer_to_xmltype(buf);
xmlBufferFree(buf);
}
else
{
! str = xmlXPathCastNodeToString(cur);
! result = (xmltype *) cstring_to_text((char *) str);
! xmlFree(str);
}
return result;
--- 3180,3200
{
buf = xmlBufferCreate();
xmlNodeDump(buf, NULL, cur, 0, 1);
! result = xmlBuffer_to_xmltype(buf, 1);
xmlBufferFree(buf);
}
else
{
! utf8str = xmlXPathCastNodeToString(cur);
! /* XPath result libxml2 internal encoding (utf8) to database encoding */
! str = (char *) pg_do_encoding_conversion(utf8str,
! strlen((const char *) utf8str),
! PG_UTF8,
! GetDatabaseEncoding());
! result = (xmltype *) cstring_to_text(str);
! if ((xmlChar *) str != utf8str)
! pfree(str);
! xmlFree(utf8str);
}
return result;
***
*** 3200,3208 xpath(PG_FUNCTION_ARGS)
--- 3227,3238
xmlXPathObjectPtr xpathobj;
char *datastr;
int32 len;
+ int32 utf8string_len;
int32 xpath_len;
xmlChar*string;
+ xmlChar*utf8string;
xmlChar*xpath_expr;
+ xmlChar*utf8xpath_expr;
int i;
int res_nitems;
int ndim;
***
*** 3294,3299 xpath(PG_FUNCTION_ARGS)
--- 3324,3345
xpath_expr[xpath_len + 2] = '\0';
xpath_len += 2;
+ /* xml body database encoding to libxml2 internal encoding (utf8) */
+ utf8string = pg_do_encoding_conversion(string,
+ len