23 января 2009 г. 0:58 пользователь Peter Eisentraut <pete...@gmx.net> написал: > On Thursday 22 January 2009 15:39:00 Sergey Burladyan wrote: >> seb=# select xpath('/русский/text()', v::xml) from (select >> xml('<русский>язык</русский>')) as x(v); >> ERROR: could not parse XML data >> DETAIL: Entity: line 1: parser error : Input is not proper UTF-8, indicate >> encoding ! >> Bytes: 0xF0 0xF3 0xF1 0xF1 >> <x><русский>язык</русский></x> >> ^
> This raises the question: What are the rules about encoding the characters in > XPath expressions themselves? I haven't found anything about that in the > standard. Anyone know? PostgreSQL does not use libxml2's internal encoding support and strips the XML encoding declaration from the XML body, so I think there is no choice: by default, libxml2 requires its input to be in its internal encoding, UTF-8, anyway. I am not sure about the XML standard, but maybe the libxml2 documentation can help to solve this issue? See http://xmlsoft.org/encoding.html: "What does this mean in practice for the libxml2 user: * xmlChar, the libxml2 data type is a byte, those bytes must be assembled as UTF-8 valid strings. The proper way to terminate an xmlChar * string is simply to append 0 byte, as usual. * One just need to make sure that when using chars outside the ASCII set, the values has been properly converted to UTF-8" I understand this as: all xmlChar strings must be in UTF-8 encoding, no matter what the encoding of the XML body is. I tried to fix this issue for the xpath function; see the patch in the attachment. By the way, contrib/xml2 also has this issue.
*** a/src/backend/utils/adt/xml.c --- b/src/backend/utils/adt/xml.c *************** *** 374,383 **** cstring_to_xmltype(const char *string) #ifdef USE_LIBXML static xmltype * ! xmlBuffer_to_xmltype(xmlBufferPtr buf) { ! return (xmltype *) cstring_to_text_with_len((char *) xmlBufferContent(buf), ! xmlBufferLength(buf)); } #endif --- 374,402 ---- #ifdef USE_LIBXML static xmltype * ! xmlBuffer_to_xmltype(xmlBufferPtr buf, int need_encode) { ! const xmlChar *utf8str = xmlBufferContent(buf); ! int len = xmlBufferLength(buf); ! char *str = NULL; ! xmltype *res; ! ! if (need_encode) ! { ! /* libxml2 internal encoding (utf8) to database encoding */ ! str = (char *) pg_do_encoding_conversion((unsigned char *) utf8str, ! len, ! PG_UTF8, ! GetDatabaseEncoding()); ! utf8str = (xmlChar *) str; ! } ! ! res = (xmltype *) cstring_to_text_with_len((const char *) utf8str, len); ! ! if (str && (char *) utf8str != str) ! pfree(str); ! ! return res; } #endif *************** *** 627,633 **** xmlelement(XmlExprState *xmlExpr, ExprContext *econtext) xmlTextWriterEndElement(writer); xmlFreeTextWriter(writer); ! result = xmlBuffer_to_xmltype(buf); xmlBufferFree(buf); return result; --- 646,652 ---- xmlTextWriterEndElement(writer); xmlFreeTextWriter(writer); ! result = xmlBuffer_to_xmltype(buf, 0); xmlBufferFree(buf); return result; *************** *** 3152,3158 **** SPI_sql_row_to_xmlelement(int rownum, StringInfo result, char *tablename, static text * xml_xmlnodetoxmltype(xmlNodePtr cur) { ! xmlChar *str; xmltype *result; xmlBufferPtr buf; --- 3171,3178 ---- static text * xml_xmlnodetoxmltype(xmlNodePtr cur) { ! xmlChar *utf8str; ! char *str; xmltype *result; xmlBufferPtr buf; *************** *** 3160,3173 **** xml_xmlnodetoxmltype(xmlNodePtr cur) { buf = xmlBufferCreate(); xmlNodeDump(buf, NULL, cur, 0, 1); ! result = xmlBuffer_to_xmltype(buf); xmlBufferFree(buf); } else { ! str = xmlXPathCastNodeToString(cur); ! result = (xmltype *) cstring_to_text((char *) str); ! 
xmlFree(str); } return result; --- 3180,3200 ---- { buf = xmlBufferCreate(); xmlNodeDump(buf, NULL, cur, 0, 1); ! result = xmlBuffer_to_xmltype(buf, 1); xmlBufferFree(buf); } else { ! utf8str = xmlXPathCastNodeToString(cur); ! /* XPath result libxml2 internal encoding (utf8) to database encoding */ ! str = (char *) pg_do_encoding_conversion(utf8str, ! strlen((const char *) utf8str), ! PG_UTF8, ! GetDatabaseEncoding()); ! result = (xmltype *) cstring_to_text(str); ! if ((xmlChar *) str != utf8str) ! pfree(str); ! xmlFree(utf8str); } return result; *************** *** 3200,3208 **** xpath(PG_FUNCTION_ARGS) --- 3227,3238 ---- xmlXPathObjectPtr xpathobj; char *datastr; int32 len; + int32 utf8string_len; int32 xpath_len; xmlChar *string; + xmlChar *utf8string; xmlChar *xpath_expr; + xmlChar *utf8xpath_expr; int i; int res_nitems; int ndim; *************** *** 3294,3299 **** xpath(PG_FUNCTION_ARGS) --- 3324,3345 ---- xpath_expr[xpath_len + 2] = '\0'; xpath_len += 2; + /* xml body database encoding to libxml2 internal encoding (utf8) */ + utf8string = pg_do_encoding_conversion(string, + len, + GetDatabaseEncoding(), + PG_UTF8); + if (utf8string != string) + utf8string_len = strlen((char *) utf8string); + else + utf8string_len = len; + + /* XPath expression database encoding to libxml2 internal encoding (utf8) */ + utf8xpath_expr = pg_do_encoding_conversion(xpath_expr, + xpath_len, + GetDatabaseEncoding(), + PG_UTF8); + xmlInitParser(); /* *************** *** 3304,3310 **** xpath(PG_FUNCTION_ARGS) if (ctxt == NULL) xml_ereport(ERROR, ERRCODE_OUT_OF_MEMORY, "could not allocate parser context"); ! doc = xmlCtxtReadMemory(ctxt, (char *) string, len, NULL, NULL, 0); if (doc == NULL) xml_ereport(ERROR, ERRCODE_INVALID_XML_DOCUMENT, "could not parse XML data"); --- 3350,3356 ---- if (ctxt == NULL) xml_ereport(ERROR, ERRCODE_OUT_OF_MEMORY, "could not allocate parser context"); ! 
doc = xmlCtxtReadMemory(ctxt, (char *) utf8string, utf8string_len, NULL, NULL, 0); if (doc == NULL) xml_ereport(ERROR, ERRCODE_INVALID_XML_DOCUMENT, "could not parse XML data"); *************** *** 3341,3347 **** xpath(PG_FUNCTION_ARGS) } } ! xpathcomp = xmlXPathCompile(xpath_expr); if (xpathcomp == NULL) /* TODO: show proper XPath error details */ xml_ereport(ERROR, ERRCODE_INTERNAL_ERROR, "invalid XPath expression"); --- 3387,3393 ---- } } ! xpathcomp = xmlXPathCompile(utf8xpath_expr); if (xpathcomp == NULL) /* TODO: show proper XPath error details */ xml_ereport(ERROR, ERRCODE_INTERNAL_ERROR, "invalid XPath expression"); *************** *** 3366,3371 **** xpath(PG_FUNCTION_ARGS) --- 3412,3421 ---- Datum elem; bool elemisnull = false; + /* + * XPath result libxml2 internal encoding (utf8) to database encoding + * converted in xml_xmlnodetoxmltype + */ elem = PointerGetDatum(xml_xmlnodetoxmltype(xpathobj->nodesetval->nodeTab[i])); astate = accumArrayResult(astate, elem, elemisnull, XMLOID,
-- Sent via pgsql-bugs mailing list (pgsql-bugs@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-bugs