Diff
Modified: trunk/LayoutTests/ChangeLog (140609 => 140610)
--- trunk/LayoutTests/ChangeLog 2013-01-24 00:21:18 UTC (rev 140609)
+++ trunk/LayoutTests/ChangeLog 2013-01-24 00:26:32 UTC (rev 140610)
@@ -1,3 +1,16 @@
+2013-01-23 Martin Robinson <[email protected]>
+
+ WebKit should support decoding multi-byte entities in XML content
+ https://bugs.webkit.org/show_bug.cgi?id=107459
+
+ Reviewed by Adam Barth.
+
+ Add a simple test for decoding some entities that resolve to multiple
+ bytes in XML.
+
+ * fast/parser/entities-in-xhtml.xhtml: Added a few multi-byte entities.
+ * fast/parser/entities-in-xhtml-expected.txt: Updated.
+
2013-01-23 Filip Pizlo <[email protected]>
Constant folding an access to an uncaptured variable that is captured later in the same basic block shouldn't lead to assertion failures
Modified: trunk/LayoutTests/fast/parser/entities-in-xhtml-expected.txt (140609 => 140610)
--- trunk/LayoutTests/fast/parser/entities-in-xhtml-expected.txt 2013-01-24 00:21:18 UTC (rev 140609)
+++ trunk/LayoutTests/fast/parser/entities-in-xhtml-expected.txt 2013-01-24 00:26:32 UTC (rev 140610)
@@ -266,3 +266,9 @@
9827 2663 clubs ♣ ♣ ♣
9829 2665 hearts ♥ ♥ ♥
9830 2666 diams ♦ ♦ ♦
+Multi-byte entities from HTML5
+
+decimal hexadecimal entity name &#nnn; &#xhhh; &entity;
+8882 22B2 vltri ⊲ ⊲ ⊲
+8834 + 8402 2282 + 20D2 sub ⊂⃒ ⊂⃒ ⊂⃒
+8804 + 8402 2264 + 20D2 sub ≤⃒ ≤⃒ ≤⃒
Modified: trunk/LayoutTests/fast/parser/entities-in-xhtml.xhtml (140609 => 140610)
--- trunk/LayoutTests/fast/parser/entities-in-xhtml.xhtml 2013-01-24 00:21:18 UTC (rev 140609)
+++ trunk/LayoutTests/fast/parser/entities-in-xhtml.xhtml 2013-01-24 00:26:32 UTC (rev 140610)
@@ -332,7 +332,28 @@
<tr><td>9827</td><td>2663</td><td>clubs</td><td>♣</td><td>♣</td><td>♣</td></tr>
<tr><td>9829</td><td>2665</td><td>hearts</td><td>♥</td><td>♥</td><td>♥</td></tr>
<tr><td>9830</td><td>2666</td><td>diams</td><td>♦</td><td>♦</td><td>♦</td></tr>
+
</tbody>
</table>
+
+<h2>Multi-byte entities from HTML5</h2>
+<table border="1" summary="this table lists decimal value, hexadecimal value,
+entity name and respective entity references of XHTML entities.">
+<thead>
+<tr>
+<th>decimal</th>
+<th>hexadecimal</th>
+<th>entity name</th>
+<th>&#nnn;</th>
+<th>&#xhhh;</th>
+<th>&entity;</th>
+</tr>
+</thead>
+<tbody>
+<tr><td>8882</td><td>22B2</td><td>vltri</td><td>⊲</td><td>⊲</td><td>⊲</td></tr>
+<tr><td>8834 + 8402</td><td>2282 + 20D2</td><td>sub</td><td>⊂⃒</td><td>⊂⃒</td><td>⊂⃒</td></tr>
+<tr><td>8804 + 8402</td><td>2264 + 20D2</td><td>sub</td><td>≤⃒</td><td>≤⃒</td><td>≤⃒</td></tr>
+</tbody>
+</table>
</body>
</html>
Modified: trunk/Source/WebCore/ChangeLog (140609 => 140610)
--- trunk/Source/WebCore/ChangeLog 2013-01-24 00:21:18 UTC (rev 140609)
+++ trunk/Source/WebCore/ChangeLog 2013-01-24 00:26:32 UTC (rev 140610)
@@ -1,3 +1,27 @@
+2013-01-23 Martin Robinson <[email protected]>
+
+ WebKit should support decoding multi-byte entities in XML content
+ https://bugs.webkit.org/show_bug.cgi?id=107459
+
+ Reviewed by Adam Barth.
+
+ Test: fast/parser/entities-in-xhtml.xhtml
+
+ * html/parser/HTMLEntityParser.cpp:
+ (WebCore::appendUChar32ToUCharArray): Added this helper function. Later patches
+ may try to move this code somewhere it can be shared more easily.
+ (WebCore::decodeNamedEntityToUCharArray): Modify this function to work on a UChar
+ array four elements long, so that multi-byte and multi-character entities can be resolved.
+ * html/parser/HTMLEntityParser.h: Updated function declaration.
+ * xml/parser/XMLDocumentParserLibxml2.cpp:
+ (WebCore): Modify the statically allocated entity string memory area to accommodate
+ up to two UTF-8 characters. Each UTF-8 character can be 4 bytes, so this brings the
+ total size to 9 bytes.
+ (WebCore::getXHTMLEntity): Use the new entity decoding API.
+ * xml/parser/XMLDocumentParserQt.cpp:
+ (WebCore::EntityResolver::resolveUndeclaredEntity): Ditto.
+ (WebCore::XMLDocumentParser::parse): Ditto.
+
2013-01-23 Eric Seidel <[email protected]>
Stop the background-parser during HTMLDocumentParser::detach to prevent crashes/asserts
Modified: trunk/Source/WebCore/html/parser/HTMLEntityParser.cpp (140609 => 140610)
--- trunk/Source/WebCore/html/parser/HTMLEntityParser.cpp 2013-01-24 00:21:18 UTC (rev 140609)
+++ trunk/Source/WebCore/html/parser/HTMLEntityParser.cpp 2013-01-24 00:26:32 UTC (rev 140610)
@@ -140,8 +140,22 @@
return consumeCharacterReference<HTMLEntityParser>(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter);
}
-UChar decodeNamedEntity(const char* name)
+static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result)
{
+ if (U_IS_BMP(value)) {
+ UChar character = static_cast<UChar>(value);
+ ASSERT(character == value);
+ result[0] = character;
+ return 1;
+ }
+
+ result[0] = U16_LEAD(value);
+ result[1] = U16_TRAIL(value);
+ return 2;
+}
+
+size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4])
+{
HTMLEntitySearch search;
while (*name) {
search.advance(*name++);
@@ -152,13 +166,10 @@
if (!search.isEntityPrefix())
return 0;
- UChar32 firstValue = search.mostRecentMatch()->firstValue;
- if (U16_LENGTH(firstValue) != 1 || search.mostRecentMatch()->secondValue) {
- // FIXME: Callers need to move off this API. Not all entities can be
- // represented in a single UChar!
- return 0;
- }
- return static_cast<UChar>(firstValue);
+ size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result);
+ if (!search.mostRecentMatch()->secondValue)
+ return numberOfCodePoints;
+ return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + numberOfCodePoints);
}
} // namespace WebCore
Modified: trunk/Source/WebCore/html/parser/HTMLEntityParser.h (140609 => 140610)
--- trunk/Source/WebCore/html/parser/HTMLEntityParser.h 2013-01-24 00:21:18 UTC (rev 140609)
+++ trunk/Source/WebCore/html/parser/HTMLEntityParser.h 2013-01-24 00:26:32 UTC (rev 140610)
@@ -34,8 +34,7 @@
bool consumeHTMLEntity(SegmentedString&, StringBuilder& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter = '\0');
// Used by the XML parser. Not suitable for use in HTML parsing. Use consumeHTMLEntity instead.
-// FIXME: Move the XML parser to an entity decoding function works for non-BMP characters!
-UChar decodeNamedEntity(const char*);
+size_t decodeNamedEntityToUCharArray(const char*, UChar result[4]);
}
Modified: trunk/Source/WebCore/xml/parser/XMLDocumentParserLibxml2.cpp (140609 => 140610)
--- trunk/Source/WebCore/xml/parser/XMLDocumentParserLibxml2.cpp 2013-01-24 00:21:18 UTC (rev 140609)
+++ trunk/Source/WebCore/xml/parser/XMLDocumentParserLibxml2.cpp 2013-01-24 00:26:32 UTC (rev 140610)
@@ -62,6 +62,7 @@
#include <wtf/Threading.h>
#include <wtf/UnusedParam.h>
#include <wtf/Vector.h>
+#include <wtf/unicode/UTF8.h>
#if ENABLE(XSLT)
#include "XMLTreeViewer.h"
@@ -1160,7 +1161,7 @@
// Using a static entity and marking it XML_INTERNAL_PREDEFINED_ENTITY is
// a hack to avoid malloc/free. Using a global variable like this could cause trouble
// if libxml implementation details were to change
-static xmlChar sharedXHTMLEntityResult[5] = {0, 0, 0, 0, 0};
+static xmlChar sharedXHTMLEntityResult[9] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
static xmlEntityPtr sharedXHTMLEntity()
{
@@ -1174,19 +1175,36 @@
return &entity;
}
+static size_t convertUTF16EntityToUTF8(const UChar* utf16Entity, size_t numberOfCodeUnits, char* target, size_t targetSize)
+{
+ const char* originalTarget = target;
+ WTF::Unicode::ConversionResult conversionResult = WTF::Unicode::convertUTF16ToUTF8(&utf16Entity,
+ utf16Entity + numberOfCodeUnits, &target, target + targetSize);
+ if (conversionResult != WTF::Unicode::conversionOK)
+ return 0;
+
+ // Even though we must pass the length, libxml expects the entity string to be null terminated.
+ ASSERT(target > originalTarget + 1);
+ *target = '\0';
+ return target - originalTarget;
+}
+
static xmlEntityPtr getXHTMLEntity(const xmlChar* name)
{
- UChar c = decodeNamedEntity(reinterpret_cast<const char*>(name));
- if (!c)
+ UChar utf16DecodedEntity[4];
+ size_t numberOfCodeUnits = decodeNamedEntityToUCharArray(reinterpret_cast<const char*>(name), utf16DecodedEntity);
+ if (!numberOfCodeUnits)
return 0;
- CString value = String(&c, 1).utf8();
- ASSERT(value.length() < 5);
+ ASSERT(numberOfCodeUnits <= 4);
+ size_t entityLengthInUTF8 = convertUTF16EntityToUTF8(utf16DecodedEntity, numberOfCodeUnits,
+ reinterpret_cast<char*>(sharedXHTMLEntityResult), WTF_ARRAY_LENGTH(sharedXHTMLEntityResult));
+ if (!entityLengthInUTF8)
+ return 0;
+
xmlEntityPtr entity = sharedXHTMLEntity();
- entity->length = value.length();
+ entity->length = entityLengthInUTF8;
entity->name = name;
- memcpy(sharedXHTMLEntityResult, value.data(), entity->length + 1);
-
return entity;
}
Modified: trunk/Source/WebCore/xml/parser/XMLDocumentParserQt.cpp (140609 => 140610)
--- trunk/Source/WebCore/xml/parser/XMLDocumentParserQt.cpp 2013-01-24 00:21:18 UTC (rev 140609)
+++ trunk/Source/WebCore/xml/parser/XMLDocumentParserQt.cpp 2013-01-24 00:26:32 UTC (rev 140610)
@@ -67,10 +67,16 @@
virtual QString resolveUndeclaredEntity(const QString &name);
};
+static QString decodeNamedEntity(const QString& entityName)
+{
+ UChar utf16DecodedEntity[4];
+ size_t numberOfCodePoints = decodeNamedEntityToUCharArray(entityName.toUtf8().constData(), utf16DecodedEntity);
+ return QString::fromUTF16(utf16DecodedEntity, numberOfCodePoints);
+}
+
QString EntityResolver::resolveUndeclaredEntity(const QString &name)
{
- UChar c = decodeNamedEntity(name.toUtf8().constData());
- return QString(c);
+ return decodeNamedEntity(name);
}
// --------------------------------
@@ -394,13 +400,11 @@
// <<", t = "<<m_stream.text().toString();
if (isXHTMLDocument()) {
QString entity = m_stream.name().toString();
- UChar c = decodeNamedEntity(entity.toUtf8().constData());
if (!m_leafTextNode)
enterText();
ExceptionCode ec = 0;
- String str(&c, 1);
// qDebug()<<" ------- adding entity "<<str;
- m_leafTextNode->appendData(str, ec);
+ m_leafTextNode->appendData(decodeNamedEntity(entity), ec);
}
break;
}