This is an automated email from the ASF dual-hosted git repository.
stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new f50514a26 IMPALA-13525: Handle escaped characters in string literal
f50514a26 is described below
commit f50514a26557b75e476f6a02772cf57d8cfef6fa
Author: Steve Carlin <[email protected]>
AuthorDate: Wed Nov 6 15:54:15 2024 -0800
IMPALA-13525: Handle escaped characters in string literal
Changed the parser to handle escaped characters. The method
is in a new class called ParserUtil. The method was copied
from Calcite's SqlParserUtil, but one change was needed. The
Calcite method did not handle a backslash in front of a regex
character. For Impala, if we detect the backslash in front of
a regex character, we leave the character but remove the
backslash. This is tested in exprs.test
Change-Id: I9b0fbe591d1101350b2ba0f6ddb2967b819ee685
Reviewed-on: http://gerrit.cloudera.org:8080/22106
Reviewed-by: Aman Sinha <[email protected]>
Tested-by: Impala Public Jenkins <[email protected]>
---
.../src/main/codegen/templates/Parser.jj | 9 +
.../org/apache/impala/calcite/util/ParserUtil.java | 193 +++++++++++++++++++++
2 files changed, 202 insertions(+)
diff --git a/java/calcite-planner/src/main/codegen/templates/Parser.jj
b/java/calcite-planner/src/main/codegen/templates/Parser.jj
index 09dacd69c..b4093d7ef 100644
--- a/java/calcite-planner/src/main/codegen/templates/Parser.jj
+++ b/java/calcite-planner/src/main/codegen/templates/Parser.jj
@@ -123,6 +123,7 @@ import org.apache.calcite.util.Util;
import org.apache.calcite.util.trace.CalciteTrace;
import org.apache.impala.calcite.type.ImpalaSqlIntervalQualifier;
import org.apache.impala.calcite.operators.ImpalaCustomOperatorTable;
+import org.apache.impala.calcite.util.ParserUtil;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
@@ -4611,10 +4612,14 @@ SqlNode StringLiteral() :
p = SqlParserUtil.stripQuotes(getToken(0).image, DQ, DQ, "\\\"",
Casing.UNCHANGED);
try {
+ p = ParserUtil.replaceEscapedChars(p);
return SqlLiteral.createCharString(p, charSet, getPos());
} catch (java.nio.charset.UnsupportedCharsetException e) {
throw SqlUtil.newContextException(getPos(),
RESOURCE.unknownCharacterSet(charSet));
+ } catch (ParserUtil.MalformedUnicodeEscape e) {
+ throw SqlUtil.newContextException(getPos(),
+ RESOURCE.unicodeEscapeMalformed(e.i));
}
}
|
@@ -4623,10 +4628,14 @@ SqlNode StringLiteral() :
p = SqlParserUtil.stripQuotes(getToken(0).image, "'", "'", "\\'",
Casing.UNCHANGED);
try {
+ p = ParserUtil.replaceEscapedChars(p);
return SqlLiteral.createCharString(p, charSet, getPos());
} catch (java.nio.charset.UnsupportedCharsetException e) {
throw SqlUtil.newContextException(getPos(),
RESOURCE.unknownCharacterSet(charSet));
+ } catch (ParserUtil.MalformedUnicodeEscape e) {
+ throw SqlUtil.newContextException(getPos(),
+ RESOURCE.unicodeEscapeMalformed(e.i));
}
}
}
diff --git
a/java/calcite-planner/src/main/java/org/apache/impala/calcite/util/ParserUtil.java
b/java/calcite-planner/src/main/java/org/apache/impala/calcite/util/ParserUtil.java
new file mode 100644
index 000000000..c3275af7c
--- /dev/null
+++
b/java/calcite-planner/src/main/java/org/apache/impala/calcite/util/ParserUtil.java
@@ -0,0 +1,193 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.impala.calcite.util;
+
+import org.apache.calcite.sql.parser.SqlParserUtil;
+
+import static java.lang.Integer.parseInt;
+import java.util.function.Predicate;
+
+/**
+ * Utility functions for the Impala Calcite parser.
+ *
+ * IMPALA NOTE: {@link #replaceEscapedChars(String)} was copied from SqlParserUtil
+ * in Calcite 1.37. Impala-specific changes are marked with a comment below
+ * (search for IMPALA): a backslash in front of a regexp character is removed
+ * while the character itself is kept, and 8-digit unicode escapes that do not
+ * denote a valid code point are reported as malformed. The rest of the method
+ * is copied as-is.
+ */
+public class ParserUtil {
+  /**
+   * Converts the contents of a character literal with escapes like those used
+   * in the C programming language to the corresponding Java string
+   * representation.
+   *
+   * <p>If the literal "{@code E'a\tc'}" occurs in the SQL source text, then
+   * this method will be invoked with the string "{@code a\tc}" (4 characters)
+   * and will return a Java string with the three characters 'a', TAB, 'c'.
+   *
+   * <p>Handled escapes (each introduced by a backslash): {@code b f n r t},
+   * backslash, single quote, the regexp characters
+   * {@code . + * ? ^ $ ( ) [ ] { } |}, lower-case 'u' followed by exactly 4 hex
+   * digits, upper-case 'U' followed by exactly 8 hex digits, 'x' followed by 1
+   * or 2 hex digits, and an octal value starting with a digit 0-3 followed by
+   * up to 2 more octal digits. Any other character after a backslash leaves
+   * both the backslash and the character in the output. Inputs of length 0 or
+   * 1 are returned unchanged.
+   *
+   * @param input String that contains C-style escapes
+   * @return String with escapes converted into Java characters
+   * @throws MalformedUnicodeEscape if input contains invalid unicode escapes
+   */
+  public static String replaceEscapedChars(String input)
+      throws MalformedUnicodeEscape {
+    // The implementation of this method is based on Crate's method
+    // Literals.replaceEscapedChars.
+    final int length = input.length();
+    if (length <= 1) {
+      return input;
+    }
+    final StringBuilder builder = new StringBuilder(length);
+    int endIdx;
+    for (int i = 0; i < length; i++) {
+      char currentChar = input.charAt(i);
+      if (currentChar == '\\' && i + 1 < length) {
+        char nextChar = input.charAt(i + 1);
+        switch (nextChar) {
+          case 'b':
+            builder.append('\b');
+            i++;
+            break;
+          case 'f':
+            builder.append('\f');
+            i++;
+            break;
+          case 'n':
+            builder.append('\n');
+            i++;
+            break;
+          case 'r':
+            builder.append('\r');
+            i++;
+            break;
+          case 't':
+            builder.append('\t');
+            i++;
+            break;
+          case '\\':
+          case '\'':
+          // IMPALA NOTE: Impala also allows escaping of these regexp characters.
+          // We just remove the backslash here
+          case '.':
+          case '+':
+          case '*':
+          case '?':
+          case '^':
+          case '$':
+          case '(':
+          case ')':
+          case '[':
+          case ']':
+          case '{':
+          case '}':
+          case '|':
+            builder.append(nextChar);
+            i++;
+            break;
+          case 'u':
+          case 'U':
+            // handle unicode case: 4 hex digits after 'u', 8 after 'U'
+            final int charsToConsume = (nextChar == 'u') ? 4 : 8;
+            // i + 1 + charsToConsume is the index of the last required hex digit
+            if (i + 1 + charsToConsume >= length) {
+              throw new MalformedUnicodeEscape(i);
+            }
+            endIdx =
+                calculateMaxCharsInSequence(input, i + 2, charsToConsume,
+                    SqlParserUtil::isHexDigit);
+            if (endIdx != i + 2 + charsToConsume) {
+              throw new MalformedUnicodeEscape(i);
+            }
+            // IMPALA NOTE: 8 hex digits can exceed Integer.MAX_VALUE (parseInt
+            // would throw NumberFormatException) or Character.MAX_CODE_POINT
+            // (appendCodePoint would throw IllegalArgumentException). Parse
+            // unsigned and report such values as malformed escapes instead of
+            // letting an unchecked exception escape the parser.
+            final int codePoint =
+                Integer.parseUnsignedInt(input.substring(i + 2, endIdx), 16);
+            if (!Character.isValidCodePoint(codePoint)) {
+              throw new MalformedUnicodeEscape(i);
+            }
+            builder.appendCodePoint(codePoint);
+            i = endIdx - 1; // skip already consumed chars
+            break;
+          case 'x':
+            // handle hex byte case - up to 2 chars for hex value
+            endIdx =
+                calculateMaxCharsInSequence(input, i + 2, 2,
+                    SqlParserUtil::isHexDigit);
+            if (endIdx > i + 2) {
+              builder.appendCodePoint(parseInt(input.substring(i + 2, endIdx), 16));
+              i = endIdx - 1; // skip already consumed chars
+            } else {
+              // hex sequence unmatched - output original char
+              builder.append(nextChar);
+              i++;
+            }
+            break;
+          case '0':
+          case '1':
+          case '2':
+          case '3':
+            // handle octal case - up to 3 chars
+            endIdx =
+                calculateMaxCharsInSequence(input, i + 2,
+                    2, // first char is already "consumed"
+                    SqlParserUtil::isOctalDigit);
+            builder.appendCodePoint(parseInt(input.substring(i + 1, endIdx), 8));
+            i = endIdx - 1; // skip already consumed chars
+            break;
+          default:
+            // non-valid escaped char sequence: keep the backslash; the following
+            // character is appended unchanged by the next loop iteration
+            builder.append(currentChar);
+        }
+      } else {
+        builder.append(currentChar);
+      }
+    }
+    return builder.toString();
+  }
+
+  /**
+   * Scans the {@link CharSequence} argument from {@code beginIndex} while the
+   * characters match the given {@link Predicate} and returns the index at which
+   * the scan stopped. At most {@code maxCharsToMatch} characters are examined,
+   * and the scan never runs past the end of the sequence.
+   *
+   * <p>Examples:
+   * <pre>
+   * {@code
+   *    calculateMaxCharsInSequence("12345", 0, 2, Character::isDigit) -> 2
+   *    calculateMaxCharsInSequence("12345", 3, 2, Character::isDigit) -> 5
+   *    calculateMaxCharsInSequence("12345", 4, 2, Character::isDigit) -> 5
+   * }
+   * </pre>
+   *
+   * @param seq sequence to scan
+   * @param beginIndex index of the first character to test
+   * @param maxCharsToMatch maximum number of characters to test
+   * @param predicate test applied to each character
+   * @return the index of the first non-matching character, or the index at
+   *     which the cap or the end of the sequence was reached
+   */
+  private static int calculateMaxCharsInSequence(CharSequence seq,
+      int beginIndex,
+      int maxCharsToMatch,
+      Predicate<Character> predicate) {
+    int idx = beginIndex;
+    final int end = Math.min(seq.length(), beginIndex + maxCharsToMatch);
+    while (idx < end && predicate.test(seq.charAt(idx))) {
+      idx++;
+    }
+    return idx;
+  }
+
+  /** Thrown by {@link #replaceEscapedChars(String)}. */
+  public static class MalformedUnicodeEscape extends Exception {
+    private static final long serialVersionUID = 1L;
+
+    /** Index in the input of the backslash that starts the malformed escape. */
+    public final int i;
+
+    MalformedUnicodeEscape(int i) {
+      super("Malformed unicode escape at index " + i);
+      this.i = i;
+    }
+  }
+}