23 января 2009 г. 0:58 пользователь Peter Eisentraut <[email protected]> написал:
> On Thursday 22 January 2009 15:39:00 Sergey Burladyan wrote:
>> seb=# select xpath('/русский/text()', v::xml) from (select
>> xml('<русский>язык</русский>')) as x(v);
>> ERROR: could not parse XML data
>> DETAIL: Entity: line 1: parser error : Input is not proper UTF-8, indicate
>> encoding !
>> Bytes: 0xF0 0xF3 0xF1 0xF1
>> <x><русский>язык</русский></x>
>> ^
> This raises the question: What are the rules about encoding the characters in
> XPath expressions themselves? I haven't found anything about that in the
> standard. Anyone know?
PostgreSQL does not use libxml2's internal encoding support and strips
the XML encoding from the XML body, so I think there is no choice: by default,
libxml2 requires the data to be in its internal encoding, UTF-8, anyway.
I am not sure about the XML standard, but maybe the libxml2 documentation
can help to resolve this issue? See http://xmlsoft.org/encoding.html
"What does this mean in practice for the libxml2 user:
* xmlChar, the libxml2 data type is a byte, those bytes must be
assembled as UTF-8 valid strings. The proper way to terminate an
xmlChar * string is simply to append 0 byte, as usual.
* One just need to make sure that when using chars outside the ASCII
set, the values has been properly converted to UTF-8"
I understand this as: all xmlChar strings must be in UTF-8 encoding,
no matter what the encoding of the XML body is.
I tried to fix this issue for the xpath function; see the patch in the attachment.
By the way, contrib/xml2 also has this issue...
*** a/src/backend/utils/adt/xml.c
--- b/src/backend/utils/adt/xml.c
***************
*** 374,383 **** cstring_to_xmltype(const char *string)
#ifdef USE_LIBXML
static xmltype *
! xmlBuffer_to_xmltype(xmlBufferPtr buf)
{
! return (xmltype *) cstring_to_text_with_len((char *) xmlBufferContent(buf),
! xmlBufferLength(buf));
}
#endif
--- 374,402 ----
#ifdef USE_LIBXML
static xmltype *
! xmlBuffer_to_xmltype(xmlBufferPtr buf, int need_encode)
{
! const xmlChar *utf8str = xmlBufferContent(buf);
! int len = xmlBufferLength(buf);
! char *str = NULL;
! xmltype *res;
!
! if (need_encode)
! {
! /* libxml2 internal encoding (utf8) to database encoding */
! str = (char *) pg_do_encoding_conversion((unsigned char *) utf8str,
! len,
! PG_UTF8,
! GetDatabaseEncoding());
! utf8str = (xmlChar *) str;
! }
!
! res = (xmltype *) cstring_to_text_with_len((const char *) utf8str, len);
!
! if (str && (char *) utf8str != str)
! pfree(str);
!
! return res;
}
#endif
***************
*** 627,633 **** xmlelement(XmlExprState *xmlExpr, ExprContext *econtext)
xmlTextWriterEndElement(writer);
xmlFreeTextWriter(writer);
! result = xmlBuffer_to_xmltype(buf);
xmlBufferFree(buf);
return result;
--- 646,652 ----
xmlTextWriterEndElement(writer);
xmlFreeTextWriter(writer);
! result = xmlBuffer_to_xmltype(buf, 0);
xmlBufferFree(buf);
return result;
***************
*** 3152,3158 **** SPI_sql_row_to_xmlelement(int rownum, StringInfo result, char *tablename,
static text *
xml_xmlnodetoxmltype(xmlNodePtr cur)
{
! xmlChar *str;
xmltype *result;
xmlBufferPtr buf;
--- 3171,3178 ----
static text *
xml_xmlnodetoxmltype(xmlNodePtr cur)
{
! xmlChar *utf8str;
! char *str;
xmltype *result;
xmlBufferPtr buf;
***************
*** 3160,3173 **** xml_xmlnodetoxmltype(xmlNodePtr cur)
{
buf = xmlBufferCreate();
xmlNodeDump(buf, NULL, cur, 0, 1);
! result = xmlBuffer_to_xmltype(buf);
xmlBufferFree(buf);
}
else
{
! str = xmlXPathCastNodeToString(cur);
! result = (xmltype *) cstring_to_text((char *) str);
! xmlFree(str);
}
return result;
--- 3180,3200 ----
{
buf = xmlBufferCreate();
xmlNodeDump(buf, NULL, cur, 0, 1);
! result = xmlBuffer_to_xmltype(buf, 1);
xmlBufferFree(buf);
}
else
{
! utf8str = xmlXPathCastNodeToString(cur);
! /* XPath result libxml2 internal encoding (utf8) to database encoding */
! str = (char *) pg_do_encoding_conversion(utf8str,
! strlen((const char *) utf8str),
! PG_UTF8,
! GetDatabaseEncoding());
! result = (xmltype *) cstring_to_text(str);
! if ((xmlChar *) str != utf8str)
! pfree(str);
! xmlFree(utf8str);
}
return result;
***************
*** 3200,3208 **** xpath(PG_FUNCTION_ARGS)
--- 3227,3238 ----
xmlXPathObjectPtr xpathobj;
char *datastr;
int32 len;
+ int32 utf8string_len;
int32 xpath_len;
xmlChar *string;
+ xmlChar *utf8string;
xmlChar *xpath_expr;
+ xmlChar *utf8xpath_expr;
int i;
int res_nitems;
int ndim;
***************
*** 3294,3299 **** xpath(PG_FUNCTION_ARGS)
--- 3324,3345 ----
xpath_expr[xpath_len + 2] = '\0';
xpath_len += 2;
+ /* xml body database encoding to libxml2 internal encoding (utf8) */
+ utf8string = pg_do_encoding_conversion(string,
+ len,
+ GetDatabaseEncoding(),
+ PG_UTF8);
+ if (utf8string != string)
+ utf8string_len = strlen((char *) utf8string);
+ else
+ utf8string_len = len;
+
+ /* XPath expression database encoding to libxml2 internal encoding (utf8) */
+ utf8xpath_expr = pg_do_encoding_conversion(xpath_expr,
+ xpath_len,
+ GetDatabaseEncoding(),
+ PG_UTF8);
+
xmlInitParser();
/*
***************
*** 3304,3310 **** xpath(PG_FUNCTION_ARGS)
if (ctxt == NULL)
xml_ereport(ERROR, ERRCODE_OUT_OF_MEMORY,
"could not allocate parser context");
! doc = xmlCtxtReadMemory(ctxt, (char *) string, len, NULL, NULL, 0);
if (doc == NULL)
xml_ereport(ERROR, ERRCODE_INVALID_XML_DOCUMENT,
"could not parse XML data");
--- 3350,3356 ----
if (ctxt == NULL)
xml_ereport(ERROR, ERRCODE_OUT_OF_MEMORY,
"could not allocate parser context");
! doc = xmlCtxtReadMemory(ctxt, (char *) utf8string, utf8string_len, NULL, NULL, 0);
if (doc == NULL)
xml_ereport(ERROR, ERRCODE_INVALID_XML_DOCUMENT,
"could not parse XML data");
***************
*** 3341,3347 **** xpath(PG_FUNCTION_ARGS)
}
}
! xpathcomp = xmlXPathCompile(xpath_expr);
if (xpathcomp == NULL) /* TODO: show proper XPath error details */
xml_ereport(ERROR, ERRCODE_INTERNAL_ERROR,
"invalid XPath expression");
--- 3387,3393 ----
}
}
! xpathcomp = xmlXPathCompile(utf8xpath_expr);
if (xpathcomp == NULL) /* TODO: show proper XPath error details */
xml_ereport(ERROR, ERRCODE_INTERNAL_ERROR,
"invalid XPath expression");
***************
*** 3366,3371 **** xpath(PG_FUNCTION_ARGS)
--- 3412,3421 ----
Datum elem;
bool elemisnull = false;
+ /*
+ * XPath result libxml2 internal encoding (utf8) to database encoding
+ * converted in xml_xmlnodetoxmltype
+ */
elem = PointerGetDatum(xml_xmlnodetoxmltype(xpathobj->nodesetval->nodeTab[i]));
astate = accumArrayResult(astate, elem,
elemisnull, XMLOID,
--
Sent via pgsql-bugs mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-bugs