(impala) branch master updated: IMPALA-13525: Handle escaped characters in string literal

stigahuang Wed, 12 Feb 2025 17:07:11 -0800

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git



The following commit(s) were added to refs/heads/master by this push:
     new f50514a26 IMPALA-13525: Handle escaped characters in string literal
f50514a26 is described below

commit f50514a26557b75e476f6a02772cf57d8cfef6fa
Author: Steve Carlin <[email protected]>
AuthorDate: Wed Nov 6 15:54:15 2024 -0800

    IMPALA-13525: Handle escaped characters in string literal
    
    Changed the parser to handle escaped characters. The method
    is in a new class called ParserUtil. The method was copied
    from Calcite's SqlParserUtil, but one change was needed. The
    Calcite method did not handle a backslash in front of a regex
    character. For Impala, if we detect the backslash in front of
    a regex character, we leave the character but remove the
    backslash.  This is tested in exprs.test
    
    Change-Id: I9b0fbe591d1101350b2ba0f6ddb2967b819ee685
    Reviewed-on: http://gerrit.cloudera.org:8080/22106
    Reviewed-by: Aman Sinha <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 .../src/main/codegen/templates/Parser.jj           |   9 +
 .../org/apache/impala/calcite/util/ParserUtil.java | 193 +++++++++++++++++++++
 2 files changed, 202 insertions(+)

diff --git a/java/calcite-planner/src/main/codegen/templates/Parser.jj 
b/java/calcite-planner/src/main/codegen/templates/Parser.jj
index 09dacd69c..b4093d7ef 100644
--- a/java/calcite-planner/src/main/codegen/templates/Parser.jj
+++ b/java/calcite-planner/src/main/codegen/templates/Parser.jj
@@ -123,6 +123,7 @@ import org.apache.calcite.util.Util;
 import org.apache.calcite.util.trace.CalciteTrace;
 import org.apache.impala.calcite.type.ImpalaSqlIntervalQualifier;
 import org.apache.impala.calcite.operators.ImpalaCustomOperatorTable;
+import org.apache.impala.calcite.util.ParserUtil;
 
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableMap;
@@ -4611,10 +4612,14 @@ SqlNode StringLiteral() :
         p = SqlParserUtil.stripQuotes(getToken(0).image, DQ, DQ, "\\\"",
             Casing.UNCHANGED);
         try {
+            p = ParserUtil.replaceEscapedChars(p);
             return SqlLiteral.createCharString(p, charSet, getPos());
         } catch (java.nio.charset.UnsupportedCharsetException e) {
             throw SqlUtil.newContextException(getPos(),
                 RESOURCE.unknownCharacterSet(charSet));
+        } catch (ParserUtil.MalformedUnicodeEscape e) {
+            throw SqlUtil.newContextException(getPos(),
+                RESOURCE.unicodeEscapeMalformed(e.i));
         }
     }
 |
@@ -4623,10 +4628,14 @@ SqlNode StringLiteral() :
         p = SqlParserUtil.stripQuotes(getToken(0).image, "'", "'", "\\'",
             Casing.UNCHANGED);
         try {
+            p = ParserUtil.replaceEscapedChars(p);
             return SqlLiteral.createCharString(p, charSet, getPos());
         } catch (java.nio.charset.UnsupportedCharsetException e) {
             throw SqlUtil.newContextException(getPos(),
                 RESOURCE.unknownCharacterSet(charSet));
+        } catch (ParserUtil.MalformedUnicodeEscape e) {
+            throw SqlUtil.newContextException(getPos(),
+                RESOURCE.unicodeEscapeMalformed(e.i));
         }
     }
 }
diff --git 
a/java/calcite-planner/src/main/java/org/apache/impala/calcite/util/ParserUtil.java
 
b/java/calcite-planner/src/main/java/org/apache/impala/calcite/util/ParserUtil.java
new file mode 100644
index 000000000..c3275af7c
--- /dev/null
+++ 
b/java/calcite-planner/src/main/java/org/apache/impala/calcite/util/ParserUtil.java
@@ -0,0 +1,193 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.impala.calcite.util;
+
+import org.apache.calcite.sql.parser.SqlParserUtil;
+
+import static java.lang.Integer.parseInt;
+import java.util.function.Predicate;
+
+/**
+ * Util functions for parser
+ *
+ * IMPALA NOTE: This was copied from SqlParserUtil from Calcite 1.37 with one
+ * modification. If there is a regexp character following the
+ * backslash, we remove the backslash. The rest of the method is
+ * copied as/is. The change is noted in a comment below (search for IMPALA)
+ */
+public class ParserUtil {
+  /**
+   * replaceEscapeChars
+   *
+   * Converts the contents of a character literal  with escapes like those used
+   * in the C programming language to the corresponding Java string
+   * representation.
+   *
+   * <p>If the literal "{@code E'a\tc'}" occurs in the SQL source text, then
+   * this method will be invoked with the string "{@code a\tc}" (4 characters)
+   * and will return a Java string with the three characters 'a', TAB, 'b'.
+   *
+   * @param input String that contains C-style escapes
+   * @return String with escapes converted into Java characters
+   * @throws MalformedUnicodeEscape if input contains invalid unicode escapes
+   */
+  public static String replaceEscapedChars(String input)
+      throws MalformedUnicodeEscape {
+    // The implementation of this method is based on Crate's method
+    // Literals.replaceEscapedChars.
+    final int length = input.length();
+    if (length <= 1) {
+      return input;
+    }
+    final StringBuilder builder = new StringBuilder(length);
+    int endIdx;
+    for (int i = 0; i < length; i++) {
+      char currentChar = input.charAt(i);
+      if (currentChar == '\\' && i + 1 < length) {
+        char nextChar = input.charAt(i + 1);
+        switch (nextChar) {
+        case 'b':
+          builder.append('\b');
+          i++;
+          break;
+        case 'f':
+          builder.append('\f');
+          i++;
+          break;
+        case 'n':
+          builder.append('\n');
+          i++;
+          break;
+        case 'r':
+          builder.append('\r');
+          i++;
+          break;
+        case 't':
+          builder.append('\t');
+          i++;
+          break;
+        case '\\':
+        case '\'':
+        // IMPALA NOTE: Impala also allows escaping of these regexp 
characters. We
+        // just remove the backslash here
+        case '.':
+        case '+':
+        case '*':
+        case '?':
+        case '^':
+        case '$':
+        case '(':
+        case ')':
+        case '[':
+        case ']':
+        case '{':
+        case '}':
+        case '|':
+          builder.append(nextChar);
+          i++;
+          break;
+        case 'u':
+        case 'U':
+          // handle unicode case
+          final int charsToConsume = (nextChar == 'u') ? 4 : 8;
+          if (i + 1 + charsToConsume >= length) {
+            throw new MalformedUnicodeEscape(i);
+          }
+          endIdx =
+              calculateMaxCharsInSequence(input, i + 2, charsToConsume,
+                  SqlParserUtil::isHexDigit);
+          if (endIdx != i + 2 + charsToConsume) {
+            throw new MalformedUnicodeEscape(i);
+          }
+          builder.appendCodePoint(parseInt(input.substring(i + 2, endIdx), 
16));
+          i = endIdx - 1; // skip already consumed chars
+          break;
+        case 'x':
+          // handle hex byte case - up to 2 chars for hex value
+          endIdx =
+              calculateMaxCharsInSequence(input, i + 2, 2,
+                  SqlParserUtil::isHexDigit);
+          if (endIdx > i + 2) {
+            builder.appendCodePoint(parseInt(input.substring(i + 2, endIdx), 
16));
+            i = endIdx - 1; // skip already consumed chars
+          } else {
+            // hex sequence unmatched - output original char
+            builder.append(nextChar);
+            i++;
+          }
+          break;
+        case '0':
+        case '1':
+        case '2':
+        case '3':
+          // handle octal case - up to 3 chars
+          endIdx =
+              calculateMaxCharsInSequence(input, i + 2,
+                  2, // first char is already "consumed"
+                  SqlParserUtil::isOctalDigit);
+          builder.appendCodePoint(parseInt(input.substring(i + 1, endIdx), 8));
+          i = endIdx - 1; // skip already consumed chars
+          break;
+        default:
+          // non-valid escaped char sequence
+           builder.append(currentChar);
+        }
+      } else {
+        builder.append(currentChar);
+      }
+    }
+    return builder.toString();
+  }
+
+  /**
+   * Calculates the maximum number of consecutive characters of the
+   * {@link CharSequence} argument, starting from {@code beginIndex}, that 
match
+   * a given {@link Predicate}. The number of characters to match are either
+   * capped from the {@code maxCharsToMatch} parameter or the sequence length.
+   *
+   * <p>Examples:
+   * <pre>
+   * {@code
+   *    calculateMaxCharsInSequence("12345", 0, 2, Character::isDigit) -> 2
+   *    calculateMaxCharsInSequence("12345", 3, 2, Character::isDigit) -> 5
+   *    calculateMaxCharsInSequence("12345", 4, 2, Character::isDigit) -> 5
+   * }
+   * </pre>
+   *
+   * @return the index of the first non-matching character
+   */
+  private static int calculateMaxCharsInSequence(CharSequence seq,
+      int beginIndex,
+      int maxCharsToMatch,
+      Predicate<Character> predicate) {
+    int idx = beginIndex;
+    final int end = Math.min(seq.length(), beginIndex + maxCharsToMatch);
+    while (idx < end && predicate.test(seq.charAt(idx))) {
+      idx++;
+    }
+    return idx;
+  }
+  /** Thrown by {@link #replaceEscapedChars(String)}. */
+  public static class MalformedUnicodeEscape extends Exception {
+    public final int i;
+
+    MalformedUnicodeEscape(int i) {
+      this.i = i;
+    }
+  }
+}

(impala) branch master updated: IMPALA-13525: Handle escaped characters in string literal

Reply via email to