(impala) branch master updated: IMPALA-12718: Provides UTF-8 support for the trim functions

stigahuang Fri, 02 Feb 2024 04:23:48 -0800

This is an automated email from the ASF dual-hosted git repository.

stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git



The following commit(s) were added to refs/heads/master by this push:
     new e489ab35b IMPALA-12718: Provides UTF-8 support for the trim functions
e489ab35b is described below

commit e489ab35b1610aa387bc10caf466bbe0aafba19b
Author: Eyizoha <[email protected]>
AuthorDate: Fri Jan 19 18:22:30 2024 +0800

    IMPALA-12718: Provides UTF-8 support for the trim functions
    
    Currently, the trim functions (including BTRIM, LTRIM, RTRIM) cannot
    correctly handle strings containing multi-byte UTF-8 characters.
    Multi-byte UTF-8 characters are interpreted as multiple single-byte
    characters, leading to unexpected results.
    
    This patch provides UTF-8 support for the trim functions, enabling these
    functions to correctly handle multi-byte UTF-8 characters (when
    utf8_mode=true is set). It also introduces a set of trim functions with
    the 'utf8_' prefix, offering the same capability even when utf8_mode is
    not enabled.
    
    Testing:
     - Added new BE test case in ExprTest#Utf8Test
     - Added new E2E test case in TestUtf8StringFunctions
    
    Change-Id: I5cfaffd71009f16eae75910af835bd2a34410856
    Reviewed-on: http://gerrit.cloudera.org:8080/20926
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 be/src/exprs/expr-test.cc                          |  32 +++++
 be/src/exprs/string-functions-ir.cc                | 141 +++++++++++++++++----
 be/src/exprs/string-functions.h                    |  43 +++++++
 be/src/util/bit-util.h                             |  55 ++++++--
 be/src/util/string-util.cc                         |   4 +-
 common/function-registry/impala_functions.py       |  21 +++
 .../queries/QueryTest/utf8-string-functions.test   |  85 +++++++++++++
 7 files changed, 342 insertions(+), 39 deletions(-)

diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc
index a26a5ecf4..52b6b0a3d 100644
--- a/be/src/exprs/expr-test.cc
+++ b/be/src/exprs/expr-test.cc
@@ -10987,6 +10987,15 @@ TEST_P(ExprTest, Utf8Test) {
       "\U0001f467\u200d\U0001f467\u200d\U0001f468\u200d\U0001f468"
       "\u0bbf\u0ba8\u0940\u0928");
 
+  // Tests utf8_*trim() with UTF-8 characters.
+  TestStringValue("utf8_trim(' hello你好👋 ')", "hello你好👋");
+  TestStringValue("utf8_rtrim(' hello你好👋 ')", " hello你好👋");
+  TestStringValue("utf8_ltrim(' hello你好👋 ')", "hello你好👋 ");
+  TestStringValue("utf8_btrim(' hello你好👋 ')", "hello你好👋");
+  TestStringValue("utf8_rtrim('hello你好👋', '👋hello')", "hello你好");
+  TestStringValue("utf8_ltrim('hello你好👋', '👋hello')", "你好👋");
+  TestStringValue("utf8_btrim('hello你好👋', '👋hello')", "你好");
+
   executor_->PushExecOption("utf8_mode=true");
   // Each Chinese character is encoded into 3 bytes. But in UTF-8 mode, the 
positions
   // are counted by UTF-8 characters.
@@ -11099,6 +11108,29 @@ TEST_P(ExprTest, Utf8Test) {
   TestStringValue("lower('ÁBĆ\\0ÈFĞ')", string("ábć\0èfğ", 11));
   TestStringValue("initcap('ábć\\0ÈFĞ')", string("Ábć\0èfğ", 11));
 
+  // Tests *trim() with UTF-8 characters in UTF8_MODE.
+  TestStringValue("trim('  hello 你好 👋 ')", "hello 你好 👋");
+  TestStringValue("ltrim(' hello 你好 👋 ')", "hello 你好 👋 ");
+  TestStringValue("rtrim(' hello 你好 👋 ')", " hello 你好 👋");
+  TestStringValue("btrim(' hello 你好 👋 ')", "hello 你好 👋");
+
+  TestStringValue("ltrim('ÁáBbĆć', 'ÁBĆábć')", "");
+  TestStringValue("rtrim('price价格，', '，')", "price价格");
+
+  TestStringValue("rtrim('hello你好👋', '👋hello')", "hello你好");
+  TestStringValue("ltrim('hello你好👋', '👋hello')", "你好👋");
+  TestStringValue("btrim('hello你好👋', '👋hello')", "你好");
+
+  TestStringValue("rtrim('🍎🍐🍊🍋🍌', '🍌🍋🍐🍎')", "🍎🍐🍊");
+  TestStringValue("ltrim('🍎🍐🍊🍋🍌', '🍌🍋🍐🍎')", "🍊🍋🍌");
+  TestStringValue("btrim('🍎🍐🍊🍋🍌', '🍌🍋🍐🍎')", "🍊");
+
+  TestStringValue("btrim('water💧水вода', 'вода水💧water')", "");
+  TestStringValue("btrim('fire🔥火огонь', 'огонь火🔥fire')", "");
+
+  // There are 'Zero Width Joiner' characters between the emojis.
+  TestStringValue("btrim('👨‍👩‍👧‍👦', '👧‍👦')", "👨‍👩");
+
   executor_->PopExecOption();
 }
 
diff --git a/be/src/exprs/string-functions-ir.cc 
b/be/src/exprs/string-functions-ir.cc
index ad34fcd64..de8c8d7a5 100644
--- a/be/src/exprs/string-functions-ir.cc
+++ b/be/src/exprs/string-functions-ir.cc
@@ -42,6 +42,7 @@
 
 using namespace impala_udf;
 using std::bitset;
+using std::any_of;
 
 // NOTE: be careful not to use string::append.  It is not performant.
 namespace impala {
@@ -690,35 +691,90 @@ StringVal StringFunctions::Translate(FunctionContext* 
context, const StringVal&
   return result;
 }
 
-void StringFunctions::TrimPrepare(
-    FunctionContext* context, FunctionContext::FunctionStateScope scope) {
+void StringFunctions::TrimContext::Reset(const StringVal& chars_to_trim) {
+  single_byte_chars_.reset();
+  double_byte_chars_.clear();
+  triple_byte_chars_.clear();
+  quadruple_byte_chars_.clear();
+
+  if (!utf8_mode_) {
+    for (size_t i = 0; i < chars_to_trim.len; ++i) {
+      single_byte_chars_.set(chars_to_trim.ptr[i], true);
+    }
+    return;
+  }
+
+  for (size_t i = 0, char_size = 0; i < chars_to_trim.len; i += char_size) {
+    char_size = BitUtil::NumBytesInUtf8Encoding(chars_to_trim.ptr[i]);
+
+    // If the remaining number of bytes does not match the number of bytes 
specified by
+    // the UTF-8 character, we may have encountered an illegal UTF-8 character.
+    // In order to prevent subsequent data access from going out of bounds, 
restrictions
+    // are placed here to ensure that accessing pointers to multi-byte 
characters is
+    // always safe.
+    if (UNLIKELY(i + char_size > chars_to_trim.len)) {
+      char_size = chars_to_trim.len - i;
+    }
+
+    switch (char_size) {
+      case 1: single_byte_chars_.set(chars_to_trim.ptr[i], true); break;
+      case 2: double_byte_chars_.push_back(&chars_to_trim.ptr[i]); break;
+      case 3: triple_byte_chars_.push_back(&chars_to_trim.ptr[i]); break;
+      case 4: quadruple_byte_chars_.push_back(&chars_to_trim.ptr[i]); break;
+      default: DCHECK(false); break;
+    }
+  }
+}
+
+bool StringFunctions::TrimContext::Contains(const uint8_t* utf8_char, int len) 
const {
+  auto eq = [&](const uint8_t* c){ return memcmp(c, utf8_char, len) == 0; };
+  switch (len) {
+    case 1: return single_byte_chars_.test(*utf8_char);
+    case 2: return any_of(double_byte_chars_.begin(), 
double_byte_chars_.end(), eq);
+    case 3: return any_of(triple_byte_chars_.begin(), 
triple_byte_chars_.end(), eq);
+    case 4: return any_of(quadruple_byte_chars_.begin(), 
quadruple_byte_chars_.end(), eq);
+    default: DCHECK(false); return false;
+  }
+}
+
+void StringFunctions::TrimPrepare(FunctionContext* context,
+    FunctionContext::FunctionStateScope scope) {
+  bool utf8_mode = 
context->impl()->GetConstFnAttr(FunctionContextImpl::UTF8_MODE);
+  DoTrimPrepare(context, scope, utf8_mode);
+}
+
+void StringFunctions::Utf8TrimPrepare(FunctionContext* context,
+    FunctionContext::FunctionStateScope scope) {
+  DoTrimPrepare(context, scope, true /* utf8_mode */);
+}
+
+void StringFunctions::DoTrimPrepare(FunctionContext* context,
+    FunctionContext::FunctionStateScope scope, bool utf8_mode) {
   if (scope != FunctionContext::THREAD_LOCAL) return;
-  // Create a bitset to hold the unique characters to trim.
-  bitset<256>* unique_chars = new bitset<256>();
-  context->SetFunctionState(scope, unique_chars);
+  TrimContext* trim_ctx = new TrimContext(utf8_mode);
+  context->SetFunctionState(scope, trim_ctx);
+
   // If the caller didn't specify the set of characters to trim, it means
   // that we're only trimming whitespace. Return early in that case.
   // There can be either 1 or 2 arguments.
   DCHECK(context->GetNumArgs() == 1 || context->GetNumArgs() == 2);
   if (context->GetNumArgs() == 1) {
-    unique_chars->set(static_cast<int>(' '), true);
+    trim_ctx->Reset(StringVal(" "));
     return;
   }
   if (!context->IsArgConstant(1)) return;
   DCHECK_EQ(context->GetArgType(1)->type, FunctionContext::TYPE_STRING);
   StringVal* chars_to_trim = 
reinterpret_cast<StringVal*>(context->GetConstantArg(1));
   if (chars_to_trim->is_null) return; // We shouldn't peek into Null StringVals
-  for (int32_t i = 0; i < chars_to_trim->len; ++i) {
-    unique_chars->set(static_cast<int>(chars_to_trim->ptr[i]), true);
-  }
+  trim_ctx->Reset(*chars_to_trim);
 }
 
 void StringFunctions::TrimClose(
     FunctionContext* context, FunctionContext::FunctionStateScope scope) {
   if (scope != FunctionContext::THREAD_LOCAL) return;
-  bitset<256>* unique_chars = reinterpret_cast<bitset<256>*>(
-      context->GetFunctionState(scope));
-  delete unique_chars;
+  TrimContext* trim_ctx =
+      reinterpret_cast<TrimContext*>(context->GetFunctionState(scope));
+  delete trim_ctx;
   context->SetFunctionState(scope, nullptr);
 }
 
@@ -726,36 +782,69 @@ template <StringFunctions::TrimPosition D, bool 
IS_IMPLICIT_WHITESPACE>
 StringVal StringFunctions::DoTrimString(FunctionContext* ctx,
     const StringVal& str, const StringVal& chars_to_trim) {
   if (str.is_null) return StringVal::null();
-  bitset<256>* unique_chars = reinterpret_cast<bitset<256>*>(
+  TrimContext* trim_ctx = reinterpret_cast<TrimContext*>(
       ctx->GetFunctionState(FunctionContext::THREAD_LOCAL));
-  // When 'chars_to_trim' is unique for each element (e.g. when 'chars_to_trim'
-  // is each element of a table column), we need to prepare a bitset of unique
-  // characters here instead of using the bitset from function context.
+
+  // When 'chars_to_trim' is not a constant, we need to reset TrimContext with 
new
+  // 'chars_to_trim'.
   if (!IS_IMPLICIT_WHITESPACE && !ctx->IsArgConstant(1)) {
     if (chars_to_trim.is_null) return str;
-    unique_chars->reset();
-    for (int32_t i = 0; i < chars_to_trim.len; ++i) {
-      unique_chars->set(static_cast<int>(chars_to_trim.ptr[i]), true);
-    }
+    trim_ctx->Reset(chars_to_trim);
   }
-  // Find new starting position.
+
+  // When dealing with UTF-8 characters in UTF-8 mode, use DoUtf8TrimString().
+  if (trim_ctx->utf8_mode()) {
+    return DoUtf8TrimString<D>(str, *trim_ctx);
+  }
+
+  // Otherwise, we continue to maintain the old behavior.
   int32_t begin = 0;
   int32_t end = str.len - 1;
-  if (D == LEADING || D == BOTH) {
-    while (begin < str.len &&
-        unique_chars->test(static_cast<int>(str.ptr[begin]))) {
+  // Find new starting position.
+  if constexpr (D == LEADING || D == BOTH) {
+    while (begin < str.len && trim_ctx->Contains(str.ptr[begin])) {
       ++begin;
     }
   }
   // Find new ending position.
-  if (D == TRAILING || D == BOTH) {
-    while (end >= begin && unique_chars->test(static_cast<int>(str.ptr[end]))) 
{
+  if constexpr (D == TRAILING || D == BOTH) {
+    while (end >= begin && trim_ctx->Contains(str.ptr[end])) {
       --end;
     }
   }
   return StringVal(str.ptr + begin, end - begin + 1);
 }
 
+template <StringFunctions::TrimPosition D>
+StringVal StringFunctions::DoUtf8TrimString(const StringVal& str,
+    const TrimContext& trim_ctx) {
+  if (UNLIKELY(str.len == 0)) return str;
+
+  const uint8_t* begin = str.ptr;
+  const uint8_t* end = begin + str.len;
+  // Find new starting position.
+  if constexpr (D == LEADING || D == BOTH) {
+    while (begin < end) {
+      size_t char_size = BitUtil::NumBytesInUtf8Encoding(*begin);
+      if (UNLIKELY(begin + char_size > end)) char_size = end - begin;
+      if (!trim_ctx.Contains(begin, char_size)) break;
+      begin += char_size;
+    }
+  }
+  // Find new ending position.
+  if constexpr (D == TRAILING || D == BOTH) {
+    while (begin < end) {
+      int char_index = FindUtf8PosBackward(begin, end - begin, 0);
+      DCHECK_NE(char_index, -1);
+      const uint8_t* char_begin = begin + char_index;
+      if (!trim_ctx.Contains(char_begin, end - char_begin)) break;
+      end = char_begin;
+    }
+  }
+
+  return StringVal(const_cast<uint8_t*>(begin), end - begin);
+}
+
 StringVal StringFunctions::Trim(FunctionContext* context, const StringVal& 
str) {
   return DoTrimString<BOTH, true>(context, str, StringVal(" "));
 }
diff --git a/be/src/exprs/string-functions.h b/be/src/exprs/string-functions.h
index 1deb8b9fa..3b6da4f56 100644
--- a/be/src/exprs/string-functions.h
+++ b/be/src/exprs/string-functions.h
@@ -54,6 +54,38 @@ class StringFunctions {
     TRAILING, // Trim from the right, or trailing end
     BOTH // Trim from both ends of string
   };
+
+  // A utility class for supporting the UTF-8 Trim() functions. Reset() loads the set
+  // of characters to trim; after Reset(), the Contains functions can be used to
+  // determine if a character needs to be trimmed.
+  class TrimContext {
+   public:
+    TrimContext(bool utf8_mode) : utf8_mode_(utf8_mode) { }
+
+    void Reset(const StringVal& chars_to_trim);
+
+    inline bool Contains(uint8_t single_char) const {
+      return single_byte_chars_.test(single_char);
+    }
+
+    inline bool Contains(const uint8_t* utf8_char, int len) const;
+
+    bool utf8_mode() const { return utf8_mode_; }
+
+   private:
+    const bool utf8_mode_;
+
+    // The bitset to hold the unique characters to trim, used for non-UTF-8 
characters
+    // or single-byte UTF-8 characters.
+    std::bitset<256> single_byte_chars_;
+
+    // Pointers to multi-byte UTF-8 characters used to check whether 
characters of the
+    // corresponding byte count need to be trimmed.
+    std::vector<const uint8_t*> double_byte_chars_;
+    std::vector<const uint8_t*> triple_byte_chars_;
+    std::vector<const uint8_t*> quadruple_byte_chars_;
+  };
+
   static StringVal Substring(FunctionContext*, const StringVal& str, const 
BigIntVal& pos,
       const BigIntVal& len);
   static StringVal Substring(FunctionContext*, const StringVal& str,
@@ -99,6 +131,7 @@ class StringFunctions {
 
   /// Sets up arguments and function context for the *TrimString functions 
below.
   static void TrimPrepare(FunctionContext*, 
FunctionContext::FunctionStateScope);
+  static void Utf8TrimPrepare(FunctionContext*, 
FunctionContext::FunctionStateScope);
   /// Cleans up the work done by TrimPrepare above.
   static void TrimClose(FunctionContext*, FunctionContext::FunctionStateScope);
 
@@ -205,6 +238,10 @@ class StringFunctions {
 
  private:
   static uint64_t re2_mem_limit_;
+
+  static void DoTrimPrepare(FunctionContext* context,
+      FunctionContext::FunctionStateScope scope, bool utf8_mode);
+
   /// Templatized implementation of the actual string trimming function.
   /// The first parameter, 'D', is one of StringFunctions::TrimPosition values.
   /// The second parameter, 'IS_IMPLICIT_WHITESPACE', is true when the set of 
characters
@@ -213,6 +250,12 @@ class StringFunctions {
   template <TrimPosition D, bool IS_IMPLICIT_WHITESPACE>
   static StringVal DoTrimString(FunctionContext* ctx, const StringVal& str,
       const StringVal& chars_to_trim);
+
+  /// Templatized implementation of the actual string trimming function with 
UTF-8
+  /// character handling.
+  /// The first parameter, 'D', is one of the values of 
StringFunctions::TrimPosition.
+  template <StringFunctions::TrimPosition D>
+  static StringVal DoUtf8TrimString(const StringVal& str, const TrimContext& 
trim_ctx);
 };
 }
 #endif
diff --git a/be/src/util/bit-util.h b/be/src/util/bit-util.h
index 86ceac272..2333c547c 100644
--- a/be/src/util/bit-util.h
+++ b/be/src/util/bit-util.h
@@ -121,21 +121,54 @@ class BitUtil {
   constexpr static inline uint32_t RoundDownNumi64(uint32_t bits) { return 
bits >> 6; }
 
   /// Returns whether the given byte is the start byte of a UTF-8 character.
-  constexpr static inline bool IsUtf8StartByte(uint8_t b) {
+  constexpr static inline bool IsUtf8StartByteRaw(uint8_t b) {
+    // If the byte is not 10xxxxxx, it is a start byte.
     return (b & 0xC0) != 0x80;
   }
 
   /// Returns the byte length of a *legal* UTF-8 character (code point) given 
its first
-  /// byte. If the first byte is between 0xC0 and 0xDF, the UTF-8 character 
has two
-  /// bytes; if it is between 0xE0 and 0xEF, the UTF-8 character has 3 bytes; 
and if it
-  /// is 0xF0 and 0xFF, the UTF-8 character has 4 bytes.
-  constexpr static inline int NumBytesInUTF8Encoding(int8_t first_byte) {
-    if (first_byte >= 0) return 1;
-    switch (first_byte & 0xF0) {
-      case 0xE0: return 3;
-      case 0xF0: return 4;
-      default: return 2;
-    }
+  /// byte. The mapping table is as follows:
+  ///   Char. number range  |        UTF-8 octet sequence
+  ///      (hexadecimal)    |              (binary)
+  ///   --------------------+---------------------------------------------
+  ///   0000 0000-0000 007F | 0xxxxxxx
+  ///   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
+  ///   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+  ///   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+  /// See more details: https://www.rfc-editor.org/rfc/rfc3629.html#section-3
+  constexpr static inline int8_t NumBytesInUtf8EncodingRaw(uint8_t first_byte) 
{
+    // If the first byte is 0xxxxxxx, the UTF-8 character has 1 byte.
+    if ((first_byte & 0x80) == 0x00) return 1;
+    // If the first byte is 110xxxxx, the UTF-8 character has 2 bytes.
+    if ((first_byte & 0xE0) == 0xC0) return 2;
+    // If the first byte is 1110xxxx, the UTF-8 character has 3 bytes.
+    if ((first_byte & 0xF0) == 0xE0) return 3;
+    // If the first byte is 11110xxx, the UTF-8 character has 4 bytes.
+    if ((first_byte & 0xF8) == 0xF0) return 4;
+    // Otherwise, the byte is a continuation byte (10xxxxxx) or invalid (11111xxx),
+    // so just return 1 to be safe.
+    return 1;
+  }
+
+  template <class T, size_t... Is>
+  constexpr static auto GenerateTable(T(*func)(uint8_t), 
std::index_sequence<Is...>) {
+    return std::array<T, sizeof...(Is)>{{func(Is)...}};
+  }
+
+  /// Same as IsUtf8StartByteRaw() but returns the result using the byte value 
as an index
+  /// from a precalculated table.
+  static inline bool IsUtf8StartByte(uint8_t byte) {
+    constexpr static auto is_utf8_start_byte_table =
+        GenerateTable(IsUtf8StartByteRaw, std::make_index_sequence<256>{});
+    return is_utf8_start_byte_table[byte];
+  }
+
+  /// Same as NumBytesInUtf8EncodingRaw() but returns the byte length of a 
UTF-8 character
+  /// using the first byte value as an index from a precalculated table.
+  static inline int NumBytesInUtf8Encoding(uint8_t first_byte) {
+    constexpr static auto utf8_byte_length_table =
+        GenerateTable(NumBytesInUtf8EncodingRaw, 
std::make_index_sequence<256>{});
+    return utf8_byte_length_table[first_byte];
   }
 
   /// Non hw accelerated pop count.
diff --git a/be/src/util/string-util.cc b/be/src/util/string-util.cc
index 676f01ee6..5394b1ffd 100644
--- a/be/src/util/string-util.cc
+++ b/be/src/util/string-util.cc
@@ -102,7 +102,7 @@ int FindUtf8PosForward(const uint8_t* ptr, const int len, 
int index) {
       --index;
     }
     if (index == 0 || pos == len) break;
-    pos += BitUtil::NumBytesInUTF8Encoding(ptr[pos]);
+    pos += BitUtil::NumBytesInUtf8Encoding(ptr[pos]);
     --index;
   }
   if (pos >= len) return len;
@@ -123,7 +123,7 @@ int FindUtf8PosBackward(const uint8_t* ptr, const int len, 
int index) {
       return -1;
     }
     // Get bytes length of the located character.
-    int bytes_len = BitUtil::NumBytesInUTF8Encoding(ptr[pos]);
+    int bytes_len = BitUtil::NumBytesInUtf8Encoding(ptr[pos]);
     // If there are not enough bytes after the first byte, i.e. last_pos-pos < 
bytes_len,
     // we consider the bytes belong to a malformed character, and count them 
as one
     // character.
diff --git a/common/function-registry/impala_functions.py 
b/common/function-registry/impala_functions.py
index 7a5516e34..71d48814c 100644
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -551,6 +551,21 @@ visible_functions = [
   [['rtrim'], 'STRING', ['STRING', 'STRING'], 
'impala::StringFunctions::RTrimString',
    
'_ZN6impala15StringFunctions11TrimPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
    
'_ZN6impala15StringFunctions9TrimCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+  [['utf8_trim'], 'STRING', ['STRING'], 'impala::StringFunctions::Trim',
+   
'_ZN6impala15StringFunctions15Utf8TrimPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+   
'_ZN6impala15StringFunctions9TrimCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+  [['utf8_ltrim'], 'STRING', ['STRING'], 'impala::StringFunctions::Ltrim',
+   
'_ZN6impala15StringFunctions15Utf8TrimPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+   
'_ZN6impala15StringFunctions9TrimCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+  [['utf8_rtrim'], 'STRING', ['STRING'], 'impala::StringFunctions::Rtrim',
+   
'_ZN6impala15StringFunctions15Utf8TrimPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+   
'_ZN6impala15StringFunctions9TrimCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+  [['utf8_ltrim'], 'STRING', ['STRING', 'STRING'], 
'impala::StringFunctions::LTrimString',
+   
'_ZN6impala15StringFunctions15Utf8TrimPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+   
'_ZN6impala15StringFunctions9TrimCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+  [['utf8_rtrim'], 'STRING', ['STRING', 'STRING'], 
'impala::StringFunctions::RTrimString',
+   
'_ZN6impala15StringFunctions15Utf8TrimPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+   
'_ZN6impala15StringFunctions9TrimCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
   [['ascii'], 'INT', ['STRING'], 'impala::StringFunctions::Ascii'],
   [['instr'], 'INT', ['STRING', 'STRING'], 'impala::StringFunctions::Instr'],
   [['instr'], 'INT', ['STRING', 'STRING', 'BIGINT'], 
'impala::StringFunctions::Instr'],
@@ -603,6 +618,12 @@ visible_functions = [
   [['btrim'], 'STRING', ['STRING', 'STRING'], 
'impala::StringFunctions::BTrimString',
    
'_ZN6impala15StringFunctions11TrimPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
    
'_ZN6impala15StringFunctions9TrimCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+  [['utf8_btrim'], 'STRING', ['STRING'], 'impala::StringFunctions::Trim',
+   
'_ZN6impala15StringFunctions15Utf8TrimPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+   
'_ZN6impala15StringFunctions9TrimCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+  [['utf8_btrim'], 'STRING', ['STRING', 'STRING'], 
'impala::StringFunctions::BTrimString',
+   
'_ZN6impala15StringFunctions15Utf8TrimPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+   
'_ZN6impala15StringFunctions9TrimCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
   [['get_json_object'], 'STRING', ['STRING', 'STRING'],
    'impala::StringFunctions::GetJsonObject'],
   [['levenshtein', 'le_dst'], 'INT', ['STRING', 'STRING'],
diff --git 
a/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test
 
b/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test
index 9417e5ad4..7a4690113 100644
--- 
a/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test
+++ 
b/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test
@@ -310,3 +310,88 @@ select id, upper(name), lower(name), initcap(name) from 
utf8_str_tiny;
 ---- TYPES
 INT,STRING,STRING,STRING
 ====
+---- QUERY
+set utf8_mode=true;
+select id, name, ltrim(name, substr(name, 1, 1)) from utf8_str_tiny;
+---- RESULTS: RAW_STRING
+1,'张三','三'
+2,'李四','四'
+3,'王五','五'
+4,'李小龙','小龙'
+5,'Alice','lice'
+6,'陈Bob','Bob'
+7,'Бopиc','opиc'
+8,'Jörg','örg'
+9,'ひなた','なた'
+10,'서연','연'
+---- TYPES
+INT,STRING,STRING
+====
+---- QUERY
+set utf8_mode=true;
+select id, name, rtrim(name, substr(name, 2)) from utf8_str_tiny;
+---- RESULTS: RAW_STRING
+1,'张三','张'
+2,'李四','李'
+3,'王五','王'
+4,'李小龙','李'
+5,'Alice','A'
+6,'陈Bob','陈'
+7,'Бopиc','Б'
+8,'Jörg','J'
+9,'ひなた','ひ'
+10,'서연','서'
+---- TYPES
+INT,STRING,STRING
+====
+---- QUERY
+set utf8_mode=true;
+select id, name, ltrim(name, '张李王小A陈БJひ서') from utf8_str_tiny;
+---- RESULTS: RAW_STRING
+1,'张三','三'
+2,'李四','四'
+3,'王五','五'
+4,'李小龙','龙'
+5,'Alice','lice'
+6,'陈Bob','Bob'
+7,'Бopиc','opиc'
+8,'Jörg','örg'
+9,'ひなた','なた'
+10,'서연','연'
+---- TYPES
+INT,STRING,STRING
+====
+---- QUERY
+set utf8_mode=true;
+select id, name, rtrim(name, '三四五小龙') from utf8_str_tiny;
+---- RESULTS: RAW_STRING
+1,'张三','张'
+2,'李四','李'
+3,'王五','王'
+4,'李小龙','李'
+5,'Alice','Alice'
+6,'陈Bob','陈Bob'
+7,'Бopиc','Бopиc'
+8,'Jörg','Jörg'
+9,'ひなた','ひなた'
+10,'서연','서연'
+---- TYPES
+INT,STRING,STRING
+====
+---- QUERY
+set utf8_mode=true;
+select id, name, btrim(name, '！？。，：；‘’“”≠≥≤∞ε∑∫√') from utf8_str_tiny;
+---- RESULTS: RAW_STRING
+1,'张三','张三'
+2,'李四','李四'
+3,'王五','王五'
+4,'李小龙','李小龙'
+5,'Alice','Alice'
+6,'陈Bob','陈Bob'
+7,'Бopиc','Бopиc'
+8,'Jörg','Jörg'
+9,'ひなた','ひなた'
+10,'서연','서연'
+---- TYPES
+INT,STRING,STRING
+====

(impala) branch master updated: IMPALA-12718: Provides UTF-8 support for the trim functions

Reply via email to