Title: [197534] trunk
Revision
197534
Author
msab...@apple.com
Date
2016-03-03 17:24:28 -0800 (Thu, 03 Mar 2016)

Log Message

[ES6] Make Unicode RegExp pattern parsing conform to the spec
https://bugs.webkit.org/show_bug.cgi?id=154988

Reviewed by Benjamin Poulain.

Source/JavaScriptCore:

Updated RegExp pattern processing with 'u' (Unicode) flag to conform to the
spec (https://tc39.github.io/ecma262/2016/#sec-patterns).  In the spec, the
grammar is annotated with [U] annotations.  Productions that are prefixed with
[+U] are only available with the Unicode flag, while productions prefixed with
[~U] are only available without the Unicode flag.
        
Added flags argument to Yarr::checkSyntax() so we can catch Unicode flag related
parsing errors at syntax checking time.  Restricted what escapes are available for
non-Unicode patterns.  Most of this is defined in the IdentityEscape rule in the
pattern grammar.

Added \- as a CharacterClass only escape in Unicode patterns.

Updated the tests for these changes.

Made changes suggested in https://bugs.webkit.org/show_bug.cgi?id=154842#c22 after
change set r197426 was landed.

* parser/ASTBuilder.h:
(JSC::ASTBuilder::createRegExp):
* parser/Parser.cpp:
(JSC::Parser<LexerType>::parsePrimaryExpression):
* parser/SyntaxChecker.h:
(JSC::SyntaxChecker::createRegExp):
* yarr/YarrInterpreter.cpp:
(JSC::Yarr::Interpreter::InputStream::readChecked):
(JSC::Yarr::Interpreter::InputStream::readSurrogatePairChecked):
(JSC::Yarr::Interpreter::InputStream::reread):
(JSC::Yarr::Interpreter::InputStream::uncheckInput):
(JSC::Yarr::Interpreter::InputStream::atStart):
(JSC::Yarr::Interpreter::InputStream::atEnd):
(JSC::Yarr::Interpreter::testCharacterClass):
(JSC::Yarr::Interpreter::backtrackPatternCharacter):
(JSC::Yarr::Interpreter::matchDisjunction):
(JSC::Yarr::ByteCompiler::atomPatternCharacter):
* yarr/YarrParser.h:
(JSC::Yarr::Parser::Parser):
(JSC::Yarr::Parser::isIdentityEscapeAnError):
(JSC::Yarr::Parser::parseEscape):
(JSC::Yarr::Parser::parse):
* yarr/YarrPattern.cpp:
(JSC::Yarr::CharacterClassConstructor::putChar):
(JSC::Yarr::CharacterClassConstructor::putRange):
(JSC::Yarr::CharacterClassConstructor::addSorted):
(JSC::Yarr::YarrPatternConstructor::setupAlternativeOffsets):
* yarr/YarrSyntaxChecker.cpp:
(JSC::Yarr::SyntaxChecker::disjunction):
(JSC::Yarr::checkSyntax):
* yarr/YarrSyntaxChecker.h:

LayoutTests:

Added test cases.

* js/regexp-unicode-expected.txt:
* js/script-tests/regexp-unicode.js:
(shouldThrowInvalidEscape):


[ES6] Add support for Symbol.toPrimitive
https://bugs.webkit.org/show_bug.cgi?id=154877

Reviewed by Saam Barati.

Update test for Symbol.toPrimitive.

* js/Object-getOwnPropertyNames-expected.txt:
* js/script-tests/Object-getOwnPropertyNames.js:

Modified Paths

Diff

Modified: trunk/LayoutTests/ChangeLog (197533 => 197534)


--- trunk/LayoutTests/ChangeLog	2016-03-04 01:07:04 UTC (rev 197533)
+++ trunk/LayoutTests/ChangeLog	2016-03-04 01:24:28 UTC (rev 197534)
@@ -1,5 +1,17 @@
-2016-03-03  Keith Miller  <keith_mil...@apple.com>
+2016-03-03  Michael Saboff  <msab...@apple.com>
 
+        [ES6] Make Unicode RegExp pattern parsing conform to the spec
+        https://bugs.webkit.org/show_bug.cgi?id=154988
+
+        Reviewed by Benjamin Poulain.
+
+        Added test cases.
+
+        * js/regexp-unicode-expected.txt:
+        * js/script-tests/regexp-unicode.js:
+        (shouldThrowInvalidEscape):
+
+
         [ES6] Add support for Symbol.toPrimitive
         https://bugs.webkit.org/show_bug.cgi?id=154877
 

Modified: trunk/LayoutTests/js/regexp-unicode-expected.txt (197533 => 197534)


--- trunk/LayoutTests/js/regexp-unicode-expected.txt	2016-03-04 01:07:04 UTC (rev 197533)
+++ trunk/LayoutTests/js/regexp-unicode-expected.txt	2016-03-04 01:24:28 UTC (rev 197534)
@@ -3,19 +3,19 @@
 On success, you will see a series of "PASS" messages, followed by "TEST COMPLETE".
 
 
-PASS "a".match(/a/)[0].length is 1
-PASS "a".match(/A/i)[0].length is 1
 PASS "a".match(/a/u)[0].length is 1
+PASS "a".match(/A/ui)[0].length is 1
+PASS "a".match(/a/u)[0].length is 1
 PASS "a".match(/A/iu)[0].length is 1
-PASS "Ȓ".match(/Ȓ/)[0].length is 1
 PASS "Ȓ".match(/Ȓ/u)[0].length is 1
-PASS "ሴ".match(/ሴ/)[0].length is 1
+PASS "Ȓ".match(/Ȓ/u)[0].length is 1
 PASS "ሴ".match(/ሴ/u)[0].length is 1
-PASS "⪼".match(/⪼/)[0].length is 1
+PASS "ሴ".match(/ሴ/u)[0].length is 1
+PASS "⪼".match(/⪼/u)[0].length is 1
 PASS "㿭".match(/㿭/u)[0].length is 1
 PASS "𒍅".match(/𒍅/u)[0].length is 2
 PASS "𒍅".match(/𒍅/u)[0].length is 2
-PASS "𝌆".match(/𝌆/)[0].length is 2
+PASS "𝌆".match(/𝌆/u)[0].length is 2
 PASS /𐑏/u.test("𐑏") is true
 PASS /𐑏/u.test("𐑏") is true
 PASS "𝌆".match(/𝌆/u)[0].length is 2
@@ -41,6 +41,8 @@
 PASS "Ťx".match(/ťx/iu)[0].length is 2
 PASS "𝌆".match(/^.$/u)[0].length is 2
 PASS "It is 78°".match(/.*/u)[0].length is 9
+PASS stringWithDanglingFirstSurrogate.match(/.*/u)[0].length is 3
+PASS stringWithDanglingSecondSurrogate.match(/.*/u)[0].length is 3
 PASS "𝌆".match(/[𝌆a]/)[0].length is 1
 PASS "𝌆".match(/[a𝌆]/u)[0].length is 2
 PASS "𝌆".match(/[𝌆a]/u)[0].length is 2
@@ -91,6 +93,22 @@
 PASS /abc/ui.test("ẚbc") is true
 PASS /texẗ/ui.test("text") is true
 PASS /text/ui.test("ẗext") is true
+PASS /\u{1}/.test("u") is true
+PASS /\u{4}/.test("u") is false
+PASS /\u{4}/.test("uuuu") is true
+PASS "800-555-1212".match(/[0-9\-]*/u)[0].length is 12
+PASS "this is ba test".match(/is b\cha test/u)[0].length is 11
+PASS new RegExp("\\/", "u").source is "\\/"
+PASS r = new RegExp("\\u{110000}", "u") threw exception SyntaxError: Invalid regular expression: invalid unicode {} escape.
+PASS r = new RegExp("\\-", "u") threw exception SyntaxError: Invalid regular expression: invalid escaped character for unicode pattern.
+PASS r = new RegExp("\\a", "u") threw exception SyntaxError: Invalid regular expression: invalid escaped character for unicode pattern.
+PASS r = new RegExp("[\\a]", "u") threw exception SyntaxError: Invalid regular expression: invalid escaped character for unicode pattern.
+PASS r = new RegExp("[\\b]", "u") threw exception SyntaxError: Invalid regular expression: invalid escaped character for unicode pattern.
+PASS r = new RegExp("[\\B]", "u") threw exception SyntaxError: Invalid regular expression: invalid escaped character for unicode pattern.
+PASS r = new RegExp("\\x", "u") threw exception SyntaxError: Invalid regular expression: invalid escaped character for unicode pattern.
+PASS r = new RegExp("[\\x]", "u") threw exception SyntaxError: Invalid regular expression: invalid escaped character for unicode pattern.
+PASS r = new RegExp("\\u", "u") threw exception SyntaxError: Invalid regular expression: invalid escaped character for unicode pattern.
+PASS r = new RegExp("[\\u]", "u") threw exception SyntaxError: Invalid regular expression: invalid escaped character for unicode pattern.
 PASS successfullyParsed is true
 
 TEST COMPLETE

Modified: trunk/LayoutTests/js/script-tests/regexp-unicode.js (197533 => 197534)


--- trunk/LayoutTests/js/script-tests/regexp-unicode.js	2016-03-04 01:07:04 UTC (rev 197533)
+++ trunk/LayoutTests/js/script-tests/regexp-unicode.js	2016-03-04 01:24:28 UTC (rev 197534)
@@ -3,19 +3,19 @@
 );
 
 // Test \u{} escapes in a regular expression
-shouldBe('"a".match(/\u{61}/)[0].length', '1');
-shouldBe('"a".match(/\u{41}/i)[0].length', '1');
+shouldBe('"a".match(/\u{61}/u)[0].length', '1');
+shouldBe('"a".match(/\u{41}/ui)[0].length', '1');
 shouldBe('"a".match(/\u{061}/u)[0].length', '1');
 shouldBe('"a".match(/\u{041}/iu)[0].length', '1');
-shouldBe('"\u{212}".match(/\u{212}/)[0].length', '1');
+shouldBe('"\u{212}".match(/\u{212}/u)[0].length', '1');
 shouldBe('"\u{212}".match(/\u{0212}/u)[0].length', '1');
-shouldBe('"\u{1234}".match(/\u{1234}/)[0].length', '1');
+shouldBe('"\u{1234}".match(/\u{1234}/u)[0].length', '1');
 shouldBe('"\u{1234}".match(/\u{01234}/u)[0].length', '1');
-shouldBe('"\u{2abc}".match(/\u{2abc}/)[0].length', '1');
+shouldBe('"\u{2abc}".match(/\u{2abc}/u)[0].length', '1');
 shouldBe('"\u{03fed}".match(/\u{03fed}/u)[0].length', '1');
 shouldBe('"\u{12345}".match(/\u{12345}/u)[0].length', '2');
 shouldBe('"\u{12345}".match(/\u{012345}/u)[0].length', '2');
-shouldBe('"\u{1d306}".match(/\u{1d306}/)[0].length', '2');
+shouldBe('"\u{1d306}".match(/\u{1d306}/u)[0].length', '2');
 shouldBeTrue('/\u{1044f}/u.test("\ud801\udc4f")');
 shouldBeTrue('/\ud801\udc4f/u.test("\u{1044f}")');
 
@@ -47,15 +47,16 @@
 // Test . matches with Unicode flag
 shouldBe('"\u{1D306}".match(/^.$/u)[0].length', '2');
 shouldBe('"It is 78\u00B0".match(/.*/u)[0].length', '9');
-// FIXME: These tests are disabled until https://bugs.webkit.org/show_bug.cgi?id=154863 is fixed
-// shouldBe('"\ud801XXX".match(/.*/u)[0].length', '4'); // We should match a dangling first surrogate as 1 character
-// shouldBe('"X\udfffXX".match(/.*/u)[0].length', '4'); // We should match a dangling second surrogate as 1 character
+var stringWithDanglingFirstSurrogate = "X\uD801X";
+shouldBe('stringWithDanglingFirstSurrogate.match(/.*/u)[0].length', '3'); // We should match a dangling first surrogate as 1 character
+var stringWithDanglingSecondSurrogate = "X\uDF01X";
+shouldBe('stringWithDanglingSecondSurrogate.match(/.*/u)[0].length', '3'); // We should match a dangling second surrogate as 1 character
 
 // Test character classes with unicode characters with and without unicode flag
-shouldBe('"\u{1d306}".match(/[\u{1d306}a]/)[0].length', '1');
+shouldBe('"\u{1d306}".match(/[\uD834\uDF06a]/)[0].length', '1');
 shouldBe('"\u{1d306}".match(/[a\u{1d306}]/u)[0].length', '2');
 shouldBe('"\u{1d306}".match(/[\u{1d306}a]/u)[0].length', '2');
-shouldBe('"\u{1d306}".match(/[a-\u{1d306}]/)[0].length', '1');
+shouldBe('"\u{1d306}".match(/[a-\uD834\uDF06]/)[0].length', '1');
 shouldBe('"\u{1d306}".match(/[a-\u{1d306}]/u)[0].length', '2');
 
 // Test a character class that is a range from one UTF16 to a Unicode character
@@ -63,7 +64,7 @@
 shouldBe('"\u1000".match(/[\u0020-\ud801\udc4f]/u)[0].length', '1');
 shouldBe('"\ud801\udc27".match(/[\u0020-\ud801\udc4f]/u)[0].length', '2');
 
-var re1 = new RegExp("[^\u0020-\ud801\udc4f]", "u");
+var re1 = new RegExp("[^\u0020-\uD801\uDC4F]", "u");
 shouldBeFalse('re1.test("Z")');
 shouldBeFalse('re1.test("\u{1000}")');
 shouldBeFalse('re1.test("\u{10400}")');
@@ -135,8 +136,44 @@
 shouldBeUndefined('match6[1]');
 shouldBe('match6[2]', '"\u{10412}\u{10412}"');
 
-// Miscellaneous tests
+// Check unicode case insensitive matches
 shouldBeTrue('/\u1e9Abc/ui.test("abc")');
 shouldBeTrue('/abc/ui.test("\u1e9Abc")');
 shouldBeTrue('/tex\u1e97/ui.test("text")');
 shouldBeTrue('/text/ui.test("\u1e97ext")');
+
+// Verify that without the unicode flag, \u{} doesn't parse as a unicode escape, but as a counted match of the character 'u'.
+shouldBeTrue('/\\u{1}/.test("u")');
+shouldBeFalse('/\\u{4}/.test("u")');
+shouldBeTrue('/\\u{4}/.test("uuuu")');
+
+// Check that \- escape works in a character class for a unicode pattern
+shouldBe('"800-555-1212".match(/[0-9\\-]*/u)[0].length', '12');
+
+// Check that control letter escapes work with unicode flag
+shouldBe('"this is b\ba test".match(/is b\\cha test/u)[0].length', '11');
+
+// Check that invalid unicode patterns throw exceptions
+shouldBe('new RegExp("\\\\/", "u").source', '"\\\\/"');
+shouldThrow('r = new RegExp("\\\\u{110000}", "u")', '"SyntaxError: Invalid regular expression: invalid unicode {} escape"');
+
+var invalidEscapeException = "SyntaxError: Invalid regular expression: invalid escaped character for unicode pattern";
+var newRegExp;
+
+function shouldThrowInvalidEscape(pattern)
+{
+    newRegExp = 'r = new RegExp("' + pattern + '", "u")';
+
+    shouldThrow(newRegExp, 'invalidEscapeException');
+}
+
+shouldThrowInvalidEscape("\\\\-");
+shouldThrowInvalidEscape("\\\\a");
+shouldThrowInvalidEscape("[\\\\a]");
+shouldThrowInvalidEscape("[\\\\b]");
+shouldThrowInvalidEscape("[\\\\B]");
+shouldThrowInvalidEscape("\\\\x");
+shouldThrowInvalidEscape("[\\\\x]");
+shouldThrowInvalidEscape("\\\\u");
+shouldThrowInvalidEscape("[\\\\u]");
+

Modified: trunk/Source/JavaScriptCore/ChangeLog (197533 => 197534)


--- trunk/Source/JavaScriptCore/ChangeLog	2016-03-04 01:07:04 UTC (rev 197533)
+++ trunk/Source/JavaScriptCore/ChangeLog	2016-03-04 01:24:28 UTC (rev 197534)
@@ -1,3 +1,60 @@
+2016-03-03  Michael Saboff  <msab...@apple.com>
+
+        [ES6] Make Unicode RegExp pattern parsing conform to the spec
+        https://bugs.webkit.org/show_bug.cgi?id=154988
+
+        Reviewed by Benjamin Poulain.
+
+        Updated RegExp pattern processing with 'u' (Unicode) flag to conform to the
+        spec (https://tc39.github.io/ecma262/2016/#sec-patterns).  In the spec, the
+        grammar is annotated with [U] annotations.  Productions that are prefixed with
+        [+U] are only available with the Unicode flag, while productions prefixed with
+        [~U] are only available without the Unicode flag.
+        
+        Added flags argument to Yarr::checkSyntax() so we can catch Unicode flag related
+        parsing errors at syntax checking time.  Restricted what escapes are available for
+        non-Unicode patterns.  Most of this is defined in the IdentityEscape rule in the
+        pattern grammar.
+
+        Added \- as a CharacterClass only escape in Unicode patterns.
+
+        Updated the tests for these changes.
+
+        Made changes suggested in https://bugs.webkit.org/show_bug.cgi?id=154842#c22 after
+        change set r197426 was landed.
+
+        * parser/ASTBuilder.h:
+        (JSC::ASTBuilder::createRegExp):
+        * parser/Parser.cpp:
+        (JSC::Parser<LexerType>::parsePrimaryExpression):
+        * parser/SyntaxChecker.h:
+        (JSC::SyntaxChecker::createRegExp):
+        * yarr/YarrInterpreter.cpp:
+        (JSC::Yarr::Interpreter::InputStream::readChecked):
+        (JSC::Yarr::Interpreter::InputStream::readSurrogatePairChecked):
+        (JSC::Yarr::Interpreter::InputStream::reread):
+        (JSC::Yarr::Interpreter::InputStream::uncheckInput):
+        (JSC::Yarr::Interpreter::InputStream::atStart):
+        (JSC::Yarr::Interpreter::InputStream::atEnd):
+        (JSC::Yarr::Interpreter::testCharacterClass):
+        (JSC::Yarr::Interpreter::backtrackPatternCharacter):
+        (JSC::Yarr::Interpreter::matchDisjunction):
+        (JSC::Yarr::ByteCompiler::atomPatternCharacter):
+        * yarr/YarrParser.h:
+        (JSC::Yarr::Parser::Parser):
+        (JSC::Yarr::Parser::isIdentityEscapeAnError):
+        (JSC::Yarr::Parser::parseEscape):
+        (JSC::Yarr::Parser::parse):
+        * yarr/YarrPattern.cpp:
+        (JSC::Yarr::CharacterClassConstructor::putChar):
+        (JSC::Yarr::CharacterClassConstructor::putRange):
+        (JSC::Yarr::CharacterClassConstructor::addSorted):
+        (JSC::Yarr::YarrPatternConstructor::setupAlternativeOffsets):
+        * yarr/YarrSyntaxChecker.cpp:
+        (JSC::Yarr::SyntaxChecker::disjunction):
+        (JSC::Yarr::checkSyntax):
+        * yarr/YarrSyntaxChecker.h:
+
 2016-03-03  Saam barati  <sbar...@apple.com>
 
         [ES6] Implement Proxy.[[DefineOwnProperty]]

Modified: trunk/Source/JavaScriptCore/parser/ASTBuilder.h (197533 => 197534)


--- trunk/Source/JavaScriptCore/parser/ASTBuilder.h	2016-03-04 01:07:04 UTC (rev 197533)
+++ trunk/Source/JavaScriptCore/parser/ASTBuilder.h	2016-03-04 01:24:28 UTC (rev 197534)
@@ -311,7 +311,7 @@
 
     ExpressionNode* createRegExp(const JSTokenLocation& location, const Identifier& pattern, const Identifier& flags, const JSTextPosition& start)
     {
-        if (Yarr::checkSyntax(pattern.string()))
+        if (Yarr::checkSyntax(pattern.string(), flags.string()))
             return 0;
         RegExpNode* node = new (m_parserArena) RegExpNode(location, pattern, flags);
         int size = pattern.length() + 2; // + 2 for the two /'s

Modified: trunk/Source/JavaScriptCore/parser/Parser.cpp (197533 => 197534)


--- trunk/Source/JavaScriptCore/parser/Parser.cpp	2016-03-04 01:07:04 UTC (rev 197533)
+++ trunk/Source/JavaScriptCore/parser/Parser.cpp	2016-03-04 01:24:28 UTC (rev 197534)
@@ -3684,7 +3684,7 @@
         next();
         TreeExpression re = context.createRegExp(location, *pattern, *flags, start);
         if (!re) {
-            const char* yarrErrorMsg = Yarr::checkSyntax(pattern->string());
+            const char* yarrErrorMsg = Yarr::checkSyntax(pattern->string(), flags->string());
             regexFail(yarrErrorMsg);
         }
         return re;

Modified: trunk/Source/JavaScriptCore/parser/SyntaxChecker.h (197533 => 197534)


--- trunk/Source/JavaScriptCore/parser/SyntaxChecker.h	2016-03-04 01:07:04 UTC (rev 197533)
+++ trunk/Source/JavaScriptCore/parser/SyntaxChecker.h	2016-03-04 01:24:28 UTC (rev 197534)
@@ -172,7 +172,7 @@
     ExpressionType createNull(const JSTokenLocation&) { return NullExpr; }
     ExpressionType createBracketAccess(const JSTokenLocation&, ExpressionType, ExpressionType, bool, int, int, int) { return BracketExpr; }
     ExpressionType createDotAccess(const JSTokenLocation&, ExpressionType, const Identifier*, int, int, int) { return DotExpr; }
-    ExpressionType createRegExp(const JSTokenLocation&, const Identifier& pattern, const Identifier&, int) { return Yarr::checkSyntax(pattern.string()) ? 0 : RegExpExpr; }
+    ExpressionType createRegExp(const JSTokenLocation&, const Identifier& pattern, const Identifier& flags, int) { return Yarr::checkSyntax(pattern.string(), flags.string()) ? 0 : RegExpExpr; }
     ExpressionType createNewExpr(const JSTokenLocation&, ExpressionType, int, int, int, int) { return NewExpr; }
     ExpressionType createNewExpr(const JSTokenLocation&, ExpressionType, int, int) { return NewExpr; }
     ExpressionType createConditionalExpr(const JSTokenLocation&, ExpressionType, ExpressionType, ExpressionType) { return ConditionalExpr; }

Modified: trunk/Source/JavaScriptCore/yarr/YarrInterpreter.cpp (197533 => 197534)


--- trunk/Source/JavaScriptCore/yarr/YarrInterpreter.cpp	2016-03-04 01:07:04 UTC (rev 197533)
+++ trunk/Source/JavaScriptCore/yarr/YarrInterpreter.cpp	2016-03-04 01:24:28 UTC (rev 197534)
@@ -208,8 +208,7 @@
             unsigned p = pos - negativePositionOffest;
             ASSERT(p < length);
             int result = input[p];
-            if (U16_IS_LEAD(result) && decodeSurrogatePairs && p + 1 < length
-                && U16_IS_TRAIL(input[p + 1])) {
+            if (U16_IS_LEAD(result) && decodeSurrogatePairs && p + 1 < length && U16_IS_TRAIL(input[p + 1])) {
                 if (atEnd())
                     return -1;
                 
@@ -219,17 +218,18 @@
             return result;
         }
         
-        int readSurrogatePairChecked(unsigned negativePositionOffest)
+        int readSurrogatePairChecked(unsigned negativePositionOffset)
         {
-            RELEASE_ASSERT(pos >= negativePositionOffest);
-            unsigned p = pos - negativePositionOffest;
+            RELEASE_ASSERT(pos >= negativePositionOffset);
+            unsigned p = pos - negativePositionOffset;
             ASSERT(p < length);
             if (p + 1 >= length)
                 return -1;
 
             int first = input[p];
-            if (U16_IS_LEAD(first) && U16_IS_TRAIL(input[p + 1]))
-                return U16_GET_SUPPLEMENTARY(first, input[p + 1]);
+            int second = input[p + 1];
+            if (U16_IS_LEAD(first) && U16_IS_TRAIL(second))
+                return U16_GET_SUPPLEMENTARY(first, second);
 
             return -1;
         }
@@ -238,11 +238,8 @@
         {
             ASSERT(from < length);
             int result = input[from];
-            if (U16_IS_LEAD(result) && decodeSurrogatePairs && from + 1 < length
-                && U16_IS_TRAIL(input[from + 1])) {
-                
+            if (U16_IS_LEAD(result) && decodeSurrogatePairs && from + 1 < length && U16_IS_TRAIL(input[from + 1]))
                 result = U16_GET_SUPPLEMENTARY(result, input[from + 1]);
-            }
             return result;
         }
 
@@ -294,9 +291,9 @@
             pos -= count;
         }
 
-        bool atStart(unsigned negativePositionOffest)
+        bool atStart(unsigned negativePositionOffset)
         {
-            return pos == negativePositionOffest;
+            return pos == negativePositionOffset;
         }
 
         bool atEnd(unsigned negativePositionOffest)
@@ -319,7 +316,7 @@
 
     bool testCharacterClass(CharacterClass* characterClass, int ch)
     {
-        if (ch & 0x1FFF80) {
+        if (!isASCII(ch)) {
             for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i)
                 if (ch == characterClass->m_matchesUnicode[i])
                     return true;
@@ -433,10 +430,7 @@
         case QuantifierGreedy:
             if (backTrack->matchAmount) {
                 --backTrack->matchAmount;
-                if (unicode && !U_IS_BMP(term.atom.patternCharacter))
-                    input.uncheckInput(2);
-                else
-                    input.uncheckInput(1);
+                input.uncheckInput(U16_LENGTH(term.atom.patternCharacter));
                 return true;
             }
             break;
@@ -1267,7 +1261,7 @@
         case ByteTerm::TypePatternCasedCharacterOnce:
         case ByteTerm::TypePatternCasedCharacterFixed: {
             if (unicode) {
-                // Case insensitive matching of unicode charaters are handled as TypeCharacterClass
+                // Case insensitive matching of unicode characters is handled as TypeCharacterClass.
                 ASSERT(U_IS_BMP(currentTerm().atom.patternCharacter));
 
                 unsigned position = input.getPos(); // May need to back out reading a surrogate pair.
@@ -1290,7 +1284,7 @@
         case ByteTerm::TypePatternCasedCharacterGreedy: {
             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
 
-            // Case insensitive matching of unicode charaters are handled as TypeCharacterClass
+            // Case insensitive matching of unicode characters is handled as TypeCharacterClass.
             ASSERT(!unicode || U_IS_BMP(currentTerm().atom.patternCharacter));
 
             unsigned matchAmount = 0;
@@ -1308,7 +1302,7 @@
         case ByteTerm::TypePatternCasedCharacterNonGreedy: {
             BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
 
-            // Case insensitive matching of unicode charaters are handled as TypeCharacterClass
+            // Case insensitive matching of unicode characters is handled as TypeCharacterClass.
             ASSERT(!unicode || U_IS_BMP(currentTerm().atom.patternCharacter));
             
             backTrack->matchAmount = 0;
@@ -1618,9 +1612,6 @@
     void atomPatternCharacter(UChar32 ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType)
     {
         if (m_pattern.m_ignoreCase) {
-            ASSERT(u_tolower(ch) <= UCHAR_MAX_VALUE);
-            ASSERT(u_toupper(ch) <= UCHAR_MAX_VALUE);
-
             UChar32 lo = u_tolower(ch);
             UChar32 hi = u_toupper(ch);
 

Modified: trunk/Source/JavaScriptCore/yarr/YarrParser.h (197533 => 197534)


--- trunk/Source/JavaScriptCore/yarr/YarrParser.h	2016-03-04 01:07:04 UTC (rev 197533)
+++ trunk/Source/JavaScriptCore/yarr/YarrParser.h	2016-03-04 01:24:28 UTC (rev 197534)
@@ -61,6 +61,7 @@
         CharacterClassOutOfOrder,
         EscapeUnterminated,
         InvalidUnicodeEscape,
+        InvalidIdentityEscape,
         NumberOfErrorCodes
     };
 
@@ -241,6 +242,19 @@
     {
     }
 
+    // The handling of IdentityEscapes is different depending on the unicode flag.
+    // For Unicode patterns, IdentityEscapes only include SyntaxCharacters or '/'.
+    // For non-unicode patterns, most any character can be escaped.
+    bool isIdentityEscapeAnError(int ch)
+    {
+        if (m_isUnicode && !strchr("^$\\.*+?()[]{}|/", ch)) {
+            m_err = InvalidIdentityEscape;
+            return true;
+        }
+
+        return false;
+    }
+
     /*
      * parseEscape():
      *
@@ -277,18 +291,24 @@
         // Assertions
         case 'b':
             consume();
-            if (inCharacterClass)
+            if (inCharacterClass) {
+                if (isIdentityEscapeAnError('b'))
+                    break;
+
                 delegate.atomPatternCharacter('\b');
-            else {
+            } else {
                 delegate.assertionWordBoundary(false);
                 return false;
             }
             break;
         case 'B':
             consume();
-            if (inCharacterClass)
+            if (inCharacterClass) {
+                if (isIdentityEscapeAnError('B'))
+                    break;
+
                 delegate.atomPatternCharacter('B');
-            else {
+            } else {
                 delegate.assertionWordBoundary(true);
                 return false;
             }
@@ -403,9 +423,12 @@
         case 'x': {
             consume();
             int x = tryConsumeHex(2);
-            if (x == -1)
+            if (x == -1) {
+                if (isIdentityEscapeAnError('x'))
+                    break;
+
                 delegate.atomPatternCharacter('x');
-            else
+            } else
                 delegate.atomPatternCharacter(x);
             break;
         }
@@ -414,20 +437,23 @@
         case 'u': {
             consume();
             if (atEndOfPattern()) {
+                if (isIdentityEscapeAnError('u'))
+                    break;
+
                 delegate.atomPatternCharacter('u');
                 break;
             }
 
-            if (peek() == '{') {
+            if (m_isUnicode && peek() == '{') {
                 consume();
                 UChar32 codePoint = 0;
                 do {
                     if (atEndOfPattern())
                         m_err = InvalidUnicodeEscape;
-                    if (!WTF::isASCIIHexDigit(peek()))
+                    if (!isASCIIHexDigit(peek()))
                         m_err = InvalidUnicodeEscape;
 
-                    codePoint = (codePoint << 4) | WTF::toASCIIHexValue(consume());
+                    codePoint = (codePoint << 4) | toASCIIHexValue(consume());
 
                     if (codePoint > UCHAR_MAX_VALUE)
                         m_err = InvalidUnicodeEscape;
@@ -441,9 +467,12 @@
                 break;
             }
             int u = tryConsumeHex(4);
-            if (u == -1)
+            if (u == -1) {
+                if (isIdentityEscapeAnError('u'))
+                    break;
+
                 delegate.atomPatternCharacter('u');
-            else {
+            } else {
                 // If we have the first of a surrogate pair, look for the second.
                 if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= 6) && peek() == '\\') {
                     ParseState state = saveState();
@@ -467,6 +496,17 @@
 
         // IdentityEscape
         default:
+            int ch = peek();
+
+            if (ch == '-' && m_isUnicode && inCharacterClass) {
+                // \- is allowed for ClassEscape with unicode flag.
+                delegate.atomPatternCharacter(consume());
+                break;
+            }
+
+            if (isIdentityEscapeAnError(ch))
+                break;
+
             delegate.atomPatternCharacter(consume());
         }
         
@@ -762,8 +802,9 @@
             REGEXP_ERROR_PREFIX "unrecognized character after (?",
             REGEXP_ERROR_PREFIX "missing terminating ] for character class",
             REGEXP_ERROR_PREFIX "range out of order in character class",
-            REGEXP_ERROR_PREFIX "\\ at end of pattern"
-            REGEXP_ERROR_PREFIX "invalid unicode {} escape"
+            REGEXP_ERROR_PREFIX "\\ at end of pattern",
+            REGEXP_ERROR_PREFIX "invalid unicode {} escape",
+            REGEXP_ERROR_PREFIX "invalid escaped character for unicode pattern"
         };
 
         return errorMessages[m_err];

Modified: trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp (197533 => 197534)


--- trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp	2016-03-04 01:07:04 UTC (rev 197533)
+++ trunk/Source/JavaScriptCore/yarr/YarrPattern.cpp	2016-03-04 01:24:28 UTC (rev 197534)
@@ -69,7 +69,7 @@
     void putChar(UChar32 ch)
     {
         // Handle ascii cases.
-        if (ch <= 0x7f) {
+        if (isASCII(ch)) {
             if (m_isCaseInsensitive && isASCIIAlpha(ch)) {
                 addSorted(m_matches, toASCIIUpper(ch));
                 addSorted(m_matches, toASCIILower(ch));
@@ -108,7 +108,7 @@
 
     void putRange(UChar32 lo, UChar32 hi)
     {
-        if (lo <= 0x7f) {
+        if (isASCII(lo)) {
             char asciiLo = lo;
             char asciiHi = std::min(hi, (UChar32)0x7f);
             addSortedRange(m_ranges, lo, asciiHi);
@@ -120,7 +120,7 @@
                     addSortedRange(m_ranges, std::max(asciiLo, 'a')+('A'-'a'), std::min(asciiHi, 'z')+('A'-'a'));
             }
         }
-        if (hi <= 0x7f)
+        if (isASCII(hi))
             return;
 
         lo = std::max(lo, (UChar32)0x80);
@@ -190,7 +190,7 @@
 private:
     void addSorted(UChar32 ch)
     {
-        addSorted(ch <= 0x7f ? m_matches : m_matchesUnicode, ch);
+        addSorted(isASCII(ch) ? m_matches : m_matchesUnicode, ch);
     }
 
     void addSorted(Vector<UChar32>& matches, UChar32 ch)
@@ -603,7 +603,7 @@
                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
                     alternative->m_hasFixedSize = false;
                 } else if (m_pattern.m_unicode) {
-                    currentInputPosition += (!U_IS_BMP(term.patternCharacter) ? 2 : 1) * term.quantityCount;
+                    currentInputPosition += U16_LENGTH(term.patternCharacter) * term.quantityCount;
                 } else
                     currentInputPosition += term.quantityCount;
                 break;

Modified: trunk/Source/JavaScriptCore/yarr/YarrSyntaxChecker.cpp (197533 => 197534)


--- trunk/Source/JavaScriptCore/yarr/YarrSyntaxChecker.cpp	2016-03-04 01:07:04 UTC (rev 197533)
+++ trunk/Source/JavaScriptCore/yarr/YarrSyntaxChecker.cpp	2016-03-04 01:24:28 UTC (rev 197534)
@@ -50,10 +50,10 @@
     void disjunction() {}
 };
 
-const char* checkSyntax(const String& pattern)
+const char* checkSyntax(const String& pattern, const String& flags)
 {
     SyntaxChecker syntaxChecker;
-    return parse(syntaxChecker, pattern, false);
+    return parse(syntaxChecker, pattern, flags.contains('u'));
 }
 
 }} // JSC::YARR

Modified: trunk/Source/JavaScriptCore/yarr/YarrSyntaxChecker.h (197533 => 197534)


--- trunk/Source/JavaScriptCore/yarr/YarrSyntaxChecker.h	2016-03-04 01:07:04 UTC (rev 197533)
+++ trunk/Source/JavaScriptCore/yarr/YarrSyntaxChecker.h	2016-03-04 01:24:28 UTC (rev 197534)
@@ -30,7 +30,7 @@
 
 namespace JSC { namespace Yarr {
 
-const char* checkSyntax(const String& pattern);
+const char* checkSyntax(const String& pattern, const String& flags);
 
 }} // JSC::YARR
 
_______________________________________________
webkit-changes mailing list
webkit-changes@lists.webkit.org
https://lists.webkit.org/mailman/listinfo/webkit-changes

Reply via email to