This is an automated email from the ASF dual-hosted git repository.
stigahuang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git
The following commit(s) were added to refs/heads/master by this push:
new e489ab35b IMPALA-12718: Provides UTF-8 support for the trim functions
e489ab35b is described below
commit e489ab35b1610aa387bc10caf466bbe0aafba19b
Author: Eyizoha <[email protected]>
AuthorDate: Fri Jan 19 18:22:30 2024 +0800
IMPALA-12718: Provides UTF-8 support for the trim functions
Currently, the trim functions (including BTRIM, LTRIM, RTRIM) cannot
correctly handle strings containing multi-byte UTF-8 characters.
Multi-byte UTF-8 characters are interpreted as multiple single-byte
characters, leading to unexpected results.
This patch provides UTF-8 support for the trim functions, enabling these
functions to correctly handle multi-byte UTF-8 characters (when
utf8_mode=true is set). It also introduces a set of trim functions with the
'utf8_' prefix, offering the same capability even when utf8_mode is not
enabled.
Testing:
- Added new BE test case in ExprTest#Utf8Test
- Added new E2E test case in TestUtf8StringFunctions
Change-Id: I5cfaffd71009f16eae75910af835bd2a34410856
Reviewed-on: http://gerrit.cloudera.org:8080/20926
Reviewed-by: Impala Public Jenkins <[email protected]>
Tested-by: Impala Public Jenkins <[email protected]>
---
be/src/exprs/expr-test.cc | 32 +++++
be/src/exprs/string-functions-ir.cc | 141 +++++++++++++++++----
be/src/exprs/string-functions.h | 43 +++++++
be/src/util/bit-util.h | 55 ++++++--
be/src/util/string-util.cc | 4 +-
common/function-registry/impala_functions.py | 21 +++
.../queries/QueryTest/utf8-string-functions.test | 85 +++++++++++++
7 files changed, 342 insertions(+), 39 deletions(-)
diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc
index a26a5ecf4..52b6b0a3d 100644
--- a/be/src/exprs/expr-test.cc
+++ b/be/src/exprs/expr-test.cc
@@ -10987,6 +10987,15 @@ TEST_P(ExprTest, Utf8Test) {
"\U0001f467\u200d\U0001f467\u200d\U0001f468\u200d\U0001f468"
"\u0bbf\u0ba8\u0940\u0928");
+ // Tests utf8_*trim() with UTF-8 characters.
+ TestStringValue("utf8_trim(' hello你好👋 ')", "hello你好👋");
+ TestStringValue("utf8_rtrim(' hello你好👋 ')", " hello你好👋");
+ TestStringValue("utf8_ltrim(' hello你好👋 ')", "hello你好👋 ");
+ TestStringValue("utf8_btrim(' hello你好👋 ')", "hello你好👋");
+ TestStringValue("utf8_rtrim('hello你好👋', '👋hello')", "hello你好");
+ TestStringValue("utf8_ltrim('hello你好👋', '👋hello')", "你好👋");
+ TestStringValue("utf8_btrim('hello你好👋', '👋hello')", "你好");
+
executor_->PushExecOption("utf8_mode=true");
// Each Chinese character is encoded into 3 bytes. But in UTF-8 mode, the
positions
// are counted by UTF-8 characters.
@@ -11099,6 +11108,29 @@ TEST_P(ExprTest, Utf8Test) {
TestStringValue("lower('ÁBĆ\\0ÈFĞ')", string("ábć\0èfğ", 11));
TestStringValue("initcap('ábć\\0ÈFĞ')", string("Ábć\0èfğ", 11));
+ // Tests *trim() with UTF-8 characters in UTF8_MODE.
+ TestStringValue("trim(' hello 你好 👋 ')", "hello 你好 👋");
+ TestStringValue("ltrim(' hello 你好 👋 ')", "hello 你好 👋 ");
+ TestStringValue("rtrim(' hello 你好 👋 ')", " hello 你好 👋");
+ TestStringValue("btrim(' hello 你好 👋 ')", "hello 你好 👋");
+
+ TestStringValue("ltrim('ÁáBbĆć', 'ÁBĆábć')", "");
+ TestStringValue("rtrim('price价格,', ',')", "price价格");
+
+ TestStringValue("rtrim('hello你好👋', '👋hello')", "hello你好");
+ TestStringValue("ltrim('hello你好👋', '👋hello')", "你好👋");
+ TestStringValue("btrim('hello你好👋', '👋hello')", "你好");
+
+ TestStringValue("rtrim('🍎🍐🍊🍋🍌', '🍌🍋🍐🍎')", "🍎🍐🍊");
+ TestStringValue("ltrim('🍎🍐🍊🍋🍌', '🍌🍋🍐🍎')", "🍊🍋🍌");
+ TestStringValue("btrim('🍎🍐🍊🍋🍌', '🍌🍋🍐🍎')", "🍊");
+
+ TestStringValue("btrim('water💧水вода', 'вода水💧water')", "");
+ TestStringValue("btrim('fire🔥火огонь', 'огонь火🔥fire')", "");
+
+ // There are 'Zero Width Joiner' between emojis.
+ TestStringValue("btrim('👨👩👧👦', '👧👦')", "👨👩");
+
executor_->PopExecOption();
}
diff --git a/be/src/exprs/string-functions-ir.cc
b/be/src/exprs/string-functions-ir.cc
index ad34fcd64..de8c8d7a5 100644
--- a/be/src/exprs/string-functions-ir.cc
+++ b/be/src/exprs/string-functions-ir.cc
@@ -42,6 +42,7 @@
using namespace impala_udf;
using std::bitset;
+using std::any_of;
// NOTE: be careful not to use string::append. It is not performant.
namespace impala {
@@ -690,35 +691,90 @@ StringVal StringFunctions::Translate(FunctionContext*
context, const StringVal&
return result;
}
-void StringFunctions::TrimPrepare(
- FunctionContext* context, FunctionContext::FunctionStateScope scope) {
+void StringFunctions::TrimContext::Reset(const StringVal& chars_to_trim) {
+ single_byte_chars_.reset();
+ double_byte_chars_.clear();
+ triple_byte_chars_.clear();
+ quadruple_byte_chars_.clear();
+
+ if (!utf8_mode_) {
+ for (size_t i = 0; i < chars_to_trim.len; ++i) {
+ single_byte_chars_.set(chars_to_trim.ptr[i], true);
+ }
+ return;
+ }
+
+ for (size_t i = 0, char_size = 0; i < chars_to_trim.len; i += char_size) {
+ char_size = BitUtil::NumBytesInUtf8Encoding(chars_to_trim.ptr[i]);
+
+ // If the remaining number of bytes does not match the number of bytes
specified by
+ // the UTF-8 character, we may have encountered an illegal UTF-8 character.
+ // In order to prevent subsequent data access from going out of bounds,
restrictions
+ // are placed here to ensure that accessing pointers to multi-byte
characters is
+ // always safe.
+ if (UNLIKELY(i + char_size > chars_to_trim.len)) {
+ char_size = chars_to_trim.len - i;
+ }
+
+ switch (char_size) {
+ case 1: single_byte_chars_.set(chars_to_trim.ptr[i], true); break;
+ case 2: double_byte_chars_.push_back(&chars_to_trim.ptr[i]); break;
+ case 3: triple_byte_chars_.push_back(&chars_to_trim.ptr[i]); break;
+ case 4: quadruple_byte_chars_.push_back(&chars_to_trim.ptr[i]); break;
+ default: DCHECK(false); break;
+ }
+ }
+}
+
+bool StringFunctions::TrimContext::Contains(const uint8_t* utf8_char, int len)
const {
+ auto eq = [&](const uint8_t* c){ return memcmp(c, utf8_char, len) == 0; };
+ switch (len) {
+ case 1: return single_byte_chars_.test(*utf8_char);
+ case 2: return any_of(double_byte_chars_.begin(),
double_byte_chars_.end(), eq);
+ case 3: return any_of(triple_byte_chars_.begin(),
triple_byte_chars_.end(), eq);
+ case 4: return any_of(quadruple_byte_chars_.begin(),
quadruple_byte_chars_.end(), eq);
+ default: DCHECK(false); return false;
+ }
+}
+
+void StringFunctions::TrimPrepare(FunctionContext* context,
+ FunctionContext::FunctionStateScope scope) {
+ bool utf8_mode =
context->impl()->GetConstFnAttr(FunctionContextImpl::UTF8_MODE);
+ DoTrimPrepare(context, scope, utf8_mode);
+}
+
+void StringFunctions::Utf8TrimPrepare(FunctionContext* context,
+ FunctionContext::FunctionStateScope scope) {
+ DoTrimPrepare(context, scope, true /* utf8_mode */);
+}
+
+void StringFunctions::DoTrimPrepare(FunctionContext* context,
+ FunctionContext::FunctionStateScope scope, bool utf8_mode) {
if (scope != FunctionContext::THREAD_LOCAL) return;
- // Create a bitset to hold the unique characters to trim.
- bitset<256>* unique_chars = new bitset<256>();
- context->SetFunctionState(scope, unique_chars);
+ TrimContext* trim_ctx = new TrimContext(utf8_mode);
+ context->SetFunctionState(scope, trim_ctx);
+
// If the caller didn't specify the set of characters to trim, it means
// that we're only trimming whitespace. Return early in that case.
// There can be either 1 or 2 arguments.
DCHECK(context->GetNumArgs() == 1 || context->GetNumArgs() == 2);
if (context->GetNumArgs() == 1) {
- unique_chars->set(static_cast<int>(' '), true);
+ trim_ctx->Reset(StringVal(" "));
return;
}
if (!context->IsArgConstant(1)) return;
DCHECK_EQ(context->GetArgType(1)->type, FunctionContext::TYPE_STRING);
StringVal* chars_to_trim =
reinterpret_cast<StringVal*>(context->GetConstantArg(1));
if (chars_to_trim->is_null) return; // We shouldn't peek into Null StringVals
- for (int32_t i = 0; i < chars_to_trim->len; ++i) {
- unique_chars->set(static_cast<int>(chars_to_trim->ptr[i]), true);
- }
+ trim_ctx->Reset(*chars_to_trim);
}
void StringFunctions::TrimClose(
FunctionContext* context, FunctionContext::FunctionStateScope scope) {
if (scope != FunctionContext::THREAD_LOCAL) return;
- bitset<256>* unique_chars = reinterpret_cast<bitset<256>*>(
- context->GetFunctionState(scope));
- delete unique_chars;
+ TrimContext* trim_ctx =
+ reinterpret_cast<TrimContext*>(context->GetFunctionState(scope));
+ delete trim_ctx;
context->SetFunctionState(scope, nullptr);
}
@@ -726,36 +782,69 @@ template <StringFunctions::TrimPosition D, bool
IS_IMPLICIT_WHITESPACE>
StringVal StringFunctions::DoTrimString(FunctionContext* ctx,
const StringVal& str, const StringVal& chars_to_trim) {
if (str.is_null) return StringVal::null();
- bitset<256>* unique_chars = reinterpret_cast<bitset<256>*>(
+ TrimContext* trim_ctx = reinterpret_cast<TrimContext*>(
ctx->GetFunctionState(FunctionContext::THREAD_LOCAL));
- // When 'chars_to_trim' is unique for each element (e.g. when 'chars_to_trim'
- // is each element of a table column), we need to prepare a bitset of unique
- // characters here instead of using the bitset from function context.
+
+ // When 'chars_to_trim' is not a constant, we need to reset TrimContext with
new
+ // 'chars_to_trim'.
if (!IS_IMPLICIT_WHITESPACE && !ctx->IsArgConstant(1)) {
if (chars_to_trim.is_null) return str;
- unique_chars->reset();
- for (int32_t i = 0; i < chars_to_trim.len; ++i) {
- unique_chars->set(static_cast<int>(chars_to_trim.ptr[i]), true);
- }
+ trim_ctx->Reset(chars_to_trim);
}
- // Find new starting position.
+
+ // When dealing with UTF-8 characters in UTF-8 mode, use DoUtf8TrimString().
+ if (trim_ctx->utf8_mode()) {
+ return DoUtf8TrimString<D>(str, *trim_ctx);
+ }
+
+ // Otherwise, we continue to maintain the old behavior.
int32_t begin = 0;
int32_t end = str.len - 1;
- if (D == LEADING || D == BOTH) {
- while (begin < str.len &&
- unique_chars->test(static_cast<int>(str.ptr[begin]))) {
+ // Find new starting position.
+ if constexpr (D == LEADING || D == BOTH) {
+ while (begin < str.len && trim_ctx->Contains(str.ptr[begin])) {
++begin;
}
}
// Find new ending position.
- if (D == TRAILING || D == BOTH) {
- while (end >= begin && unique_chars->test(static_cast<int>(str.ptr[end])))
{
+ if constexpr (D == TRAILING || D == BOTH) {
+ while (end >= begin && trim_ctx->Contains(str.ptr[end])) {
--end;
}
}
return StringVal(str.ptr + begin, end - begin + 1);
}
+template <StringFunctions::TrimPosition D>
+StringVal StringFunctions::DoUtf8TrimString(const StringVal& str,
+ const TrimContext& trim_ctx) {
+ if (UNLIKELY(str.len == 0)) return str;
+
+ const uint8_t* begin = str.ptr;
+ const uint8_t* end = begin + str.len;
+ // Find new starting position.
+ if constexpr (D == LEADING || D == BOTH) {
+ while (begin < end) {
+ size_t char_size = BitUtil::NumBytesInUtf8Encoding(*begin);
+ if (UNLIKELY(begin + char_size > end)) char_size = end - begin;
+ if (!trim_ctx.Contains(begin, char_size)) break;
+ begin += char_size;
+ }
+ }
+ // Find new ending position.
+ if constexpr (D == TRAILING || D == BOTH) {
+ while (begin < end) {
+ int char_index = FindUtf8PosBackward(begin, end - begin, 0);
+ DCHECK_NE(char_index, -1);
+ const uint8_t* char_begin = begin + char_index;
+ if (!trim_ctx.Contains(char_begin, end - char_begin)) break;
+ end = char_begin;
+ }
+ }
+
+ return StringVal(const_cast<uint8_t*>(begin), end - begin);
+}
+
StringVal StringFunctions::Trim(FunctionContext* context, const StringVal&
str) {
return DoTrimString<BOTH, true>(context, str, StringVal(" "));
}
diff --git a/be/src/exprs/string-functions.h b/be/src/exprs/string-functions.h
index 1deb8b9fa..3b6da4f56 100644
--- a/be/src/exprs/string-functions.h
+++ b/be/src/exprs/string-functions.h
@@ -54,6 +54,38 @@ class StringFunctions {
TRAILING, // Trim from the right, or trailing end
BOTH // Trim from both ends of string
};
+
+ // A utility class for supporting the UTF-8 Trim() function, initialized
with the input
+ // string to be trimmed. After Reset(), the Contains function can be used to
determine
+ // if a character needs to be trimmed.
+ class TrimContext {
+ public:
+ TrimContext(bool utf8_mode) : utf8_mode_(utf8_mode) { }
+
+ void Reset(const StringVal& chars_to_trim);
+
+ inline bool Contains(uint8_t single_char) const {
+ return single_byte_chars_.test(single_char);
+ }
+
+ inline bool Contains(const uint8_t* utf8_char, int len) const;
+
+ bool utf8_mode() const { return utf8_mode_; }
+
+ private:
+ const bool utf8_mode_;
+
+ // The bitset to hold the unique characters to trim, used for non-UTF-8
characters
+ // or single-byte UTF-8 characters.
+ std::bitset<256> single_byte_chars_;
+
+ // Pointers to multi-byte UTF-8 characters used to check whether
characters of the
+ // corresponding byte count need to be trimmed.
+ std::vector<const uint8_t*> double_byte_chars_;
+ std::vector<const uint8_t*> triple_byte_chars_;
+ std::vector<const uint8_t*> quadruple_byte_chars_;
+ };
+
static StringVal Substring(FunctionContext*, const StringVal& str, const
BigIntVal& pos,
const BigIntVal& len);
static StringVal Substring(FunctionContext*, const StringVal& str,
@@ -99,6 +131,7 @@ class StringFunctions {
/// Sets up arguments and function context for the *TrimString functions
below.
static void TrimPrepare(FunctionContext*,
FunctionContext::FunctionStateScope);
+ static void Utf8TrimPrepare(FunctionContext*,
FunctionContext::FunctionStateScope);
/// Cleans up the work done by TrimPrepare above.
static void TrimClose(FunctionContext*, FunctionContext::FunctionStateScope);
@@ -205,6 +238,10 @@ class StringFunctions {
private:
static uint64_t re2_mem_limit_;
+
+ static void DoTrimPrepare(FunctionContext* context,
+ FunctionContext::FunctionStateScope scope, bool utf8_mode);
+
/// Templatized implementation of the actual string trimming function.
/// The first parameter, 'D', is one of StringFunctions::TrimPosition values.
/// The second parameter, 'IS_IMPLICIT_WHITESPACE', is true when the set of
characters
@@ -213,6 +250,12 @@ class StringFunctions {
template <TrimPosition D, bool IS_IMPLICIT_WHITESPACE>
static StringVal DoTrimString(FunctionContext* ctx, const StringVal& str,
const StringVal& chars_to_trim);
+
+ /// Templatized implementation of the actual string trimming function with
UTF-8
+ /// character handling.
+ /// The first parameter, 'D', is one of the values of
StringFunctions::TrimPosition.
+ template <StringFunctions::TrimPosition D>
+ static StringVal DoUtf8TrimString(const StringVal& str, const TrimContext&
trim_ctx);
};
}
#endif
diff --git a/be/src/util/bit-util.h b/be/src/util/bit-util.h
index 86ceac272..2333c547c 100644
--- a/be/src/util/bit-util.h
+++ b/be/src/util/bit-util.h
@@ -121,21 +121,54 @@ class BitUtil {
constexpr static inline uint32_t RoundDownNumi64(uint32_t bits) { return
bits >> 6; }
/// Returns whether the given byte is the start byte of a UTF-8 character.
- constexpr static inline bool IsUtf8StartByte(uint8_t b) {
+ constexpr static inline bool IsUtf8StartByteRaw(uint8_t b) {
+ // If the byte is not 10xxxxxx, it is a start byte.
return (b & 0xC0) != 0x80;
}
/// Returns the byte length of a *legal* UTF-8 character (code point) given
its first
- /// byte. If the first byte is between 0xC0 and 0xDF, the UTF-8 character
has two
- /// bytes; if it is between 0xE0 and 0xEF, the UTF-8 character has 3 bytes;
and if it
- /// is 0xF0 and 0xFF, the UTF-8 character has 4 bytes.
- constexpr static inline int NumBytesInUTF8Encoding(int8_t first_byte) {
- if (first_byte >= 0) return 1;
- switch (first_byte & 0xF0) {
- case 0xE0: return 3;
- case 0xF0: return 4;
- default: return 2;
- }
+ /// byte. The mapping table is as follows:
+ /// Char. number range | UTF-8 octet sequence
+ /// (hexadecimal) | (binary)
+ /// --------------------+---------------------------------------------
+ /// 0000 0000-0000 007F | 0xxxxxxx
+ /// 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
+ /// 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+ /// 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ /// See more details: https://www.rfc-editor.org/rfc/rfc3629.html#section-3
+ constexpr static inline int8_t NumBytesInUtf8EncodingRaw(uint8_t first_byte)
{
+ // If the first byte is 0xxxxxxx, the UTF-8 character has 1 byte.
+ if ((first_byte & 0x80) == 0x00) return 1;
+ // If the first byte is 110xxxxx, the UTF-8 character has 2 bytes.
+ if ((first_byte & 0xE0) == 0xC0) return 2;
+ // If the first byte is 1110xxxx, the UTF-8 character has 3 bytes.
+ if ((first_byte & 0xF0) == 0xE0) return 3;
+ // If the first byte is 11110xxx, the UTF-8 character has 4 bytes.
+ if ((first_byte & 0xF8) == 0xF0) return 4;
+ // Otherwise, the byte is continuation (10xxxxxx) or invalid (11111xxx),
+ // just returns 1 for safe.
+ return 1;
+ }
+
+ template <class T, size_t... Is>
+ constexpr static auto GenerateTable(T(*func)(uint8_t),
std::index_sequence<Is...>) {
+ return std::array<T, sizeof...(Is)>{{func(Is)...}};
+ }
+
+ /// Same as IsUtf8StartByteRaw() but returns the result using the byte value
as an index
+ /// from a precalculated table.
+ static inline bool IsUtf8StartByte(uint8_t byte) {
+ constexpr static auto is_utf8_start_byte_table =
+ GenerateTable(IsUtf8StartByteRaw, std::make_index_sequence<256>{});
+ return is_utf8_start_byte_table[byte];
+ }
+
+ /// Same as NumBytesInUtf8EncodingRaw() but returns the byte length of a
UTF-8 character
+ /// using the first byte value as an index from a precalculated table.
+ static inline int NumBytesInUtf8Encoding(uint8_t first_byte) {
+ constexpr static auto utf8_byte_length_table =
+ GenerateTable(NumBytesInUtf8EncodingRaw,
std::make_index_sequence<256>{});
+ return utf8_byte_length_table[first_byte];
}
/// Non hw accelerated pop count.
diff --git a/be/src/util/string-util.cc b/be/src/util/string-util.cc
index 676f01ee6..5394b1ffd 100644
--- a/be/src/util/string-util.cc
+++ b/be/src/util/string-util.cc
@@ -102,7 +102,7 @@ int FindUtf8PosForward(const uint8_t* ptr, const int len,
int index) {
--index;
}
if (index == 0 || pos == len) break;
- pos += BitUtil::NumBytesInUTF8Encoding(ptr[pos]);
+ pos += BitUtil::NumBytesInUtf8Encoding(ptr[pos]);
--index;
}
if (pos >= len) return len;
@@ -123,7 +123,7 @@ int FindUtf8PosBackward(const uint8_t* ptr, const int len,
int index) {
return -1;
}
// Get bytes length of the located character.
- int bytes_len = BitUtil::NumBytesInUTF8Encoding(ptr[pos]);
+ int bytes_len = BitUtil::NumBytesInUtf8Encoding(ptr[pos]);
// If there are not enough bytes after the first byte, i.e. last_pos-pos <
bytes_len,
// we consider the bytes belong to a malformed character, and count them
as one
// character.
diff --git a/common/function-registry/impala_functions.py
b/common/function-registry/impala_functions.py
index 7a5516e34..71d48814c 100644
--- a/common/function-registry/impala_functions.py
+++ b/common/function-registry/impala_functions.py
@@ -551,6 +551,21 @@ visible_functions = [
[['rtrim'], 'STRING', ['STRING', 'STRING'],
'impala::StringFunctions::RTrimString',
'_ZN6impala15StringFunctions11TrimPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
'_ZN6impala15StringFunctions9TrimCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+ [['utf8_trim'], 'STRING', ['STRING'], 'impala::StringFunctions::Trim',
+
'_ZN6impala15StringFunctions15Utf8TrimPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+
'_ZN6impala15StringFunctions9TrimCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+ [['utf8_ltrim'], 'STRING', ['STRING'], 'impala::StringFunctions::Ltrim',
+
'_ZN6impala15StringFunctions15Utf8TrimPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+
'_ZN6impala15StringFunctions9TrimCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+ [['utf8_rtrim'], 'STRING', ['STRING'], 'impala::StringFunctions::Rtrim',
+
'_ZN6impala15StringFunctions15Utf8TrimPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+
'_ZN6impala15StringFunctions9TrimCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+ [['utf8_ltrim'], 'STRING', ['STRING', 'STRING'],
'impala::StringFunctions::LTrimString',
+
'_ZN6impala15StringFunctions15Utf8TrimPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+
'_ZN6impala15StringFunctions9TrimCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+ [['utf8_rtrim'], 'STRING', ['STRING', 'STRING'],
'impala::StringFunctions::RTrimString',
+
'_ZN6impala15StringFunctions15Utf8TrimPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+
'_ZN6impala15StringFunctions9TrimCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
[['ascii'], 'INT', ['STRING'], 'impala::StringFunctions::Ascii'],
[['instr'], 'INT', ['STRING', 'STRING'], 'impala::StringFunctions::Instr'],
[['instr'], 'INT', ['STRING', 'STRING', 'BIGINT'],
'impala::StringFunctions::Instr'],
@@ -603,6 +618,12 @@ visible_functions = [
[['btrim'], 'STRING', ['STRING', 'STRING'],
'impala::StringFunctions::BTrimString',
'_ZN6impala15StringFunctions11TrimPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
'_ZN6impala15StringFunctions9TrimCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+ [['utf8_btrim'], 'STRING', ['STRING'], 'impala::StringFunctions::Trim',
+
'_ZN6impala15StringFunctions15Utf8TrimPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+
'_ZN6impala15StringFunctions9TrimCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
+ [['utf8_btrim'], 'STRING', ['STRING', 'STRING'],
'impala::StringFunctions::BTrimString',
+
'_ZN6impala15StringFunctions15Utf8TrimPrepareEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE',
+
'_ZN6impala15StringFunctions9TrimCloseEPN10impala_udf15FunctionContextENS2_18FunctionStateScopeE'],
[['get_json_object'], 'STRING', ['STRING', 'STRING'],
'impala::StringFunctions::GetJsonObject'],
[['levenshtein', 'le_dst'], 'INT', ['STRING', 'STRING'],
diff --git
a/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test
b/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test
index 9417e5ad4..7a4690113 100644
---
a/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test
+++
b/testdata/workloads/functional-query/queries/QueryTest/utf8-string-functions.test
@@ -310,3 +310,88 @@ select id, upper(name), lower(name), initcap(name) from
utf8_str_tiny;
---- TYPES
INT,STRING,STRING,STRING
====
+---- QUERY
+set utf8_mode=true;
+select id, name, ltrim(name, substr(name, 1, 1)) from utf8_str_tiny;
+---- RESULTS: RAW_STRING
+1,'张三','三'
+2,'李四','四'
+3,'王五','五'
+4,'李小龙','小龙'
+5,'Alice','lice'
+6,'陈Bob','Bob'
+7,'Бopиc','opиc'
+8,'Jörg','örg'
+9,'ひなた','なた'
+10,'서연','연'
+---- TYPES
+INT,STRING,STRING
+====
+---- QUERY
+set utf8_mode=true;
+select id, name, rtrim(name, substr(name, 2)) from utf8_str_tiny;
+---- RESULTS: RAW_STRING
+1,'张三','张'
+2,'李四','李'
+3,'王五','王'
+4,'李小龙','李'
+5,'Alice','A'
+6,'陈Bob','陈'
+7,'Бopиc','Б'
+8,'Jörg','J'
+9,'ひなた','ひ'
+10,'서연','서'
+---- TYPES
+INT,STRING,STRING
+====
+---- QUERY
+set utf8_mode=true;
+select id, name, ltrim(name, '张李王小A陈БJひ서') from utf8_str_tiny;
+---- RESULTS: RAW_STRING
+1,'张三','三'
+2,'李四','四'
+3,'王五','五'
+4,'李小龙','龙'
+5,'Alice','lice'
+6,'陈Bob','Bob'
+7,'Бopиc','opиc'
+8,'Jörg','örg'
+9,'ひなた','なた'
+10,'서연','연'
+---- TYPES
+INT,STRING,STRING
+====
+---- QUERY
+set utf8_mode=true;
+select id, name, rtrim(name, '三四五小龙') from utf8_str_tiny;
+---- RESULTS: RAW_STRING
+1,'张三','张'
+2,'李四','李'
+3,'王五','王'
+4,'李小龙','李'
+5,'Alice','Alice'
+6,'陈Bob','陈Bob'
+7,'Бopиc','Бopиc'
+8,'Jörg','Jörg'
+9,'ひなた','ひなた'
+10,'서연','서연'
+---- TYPES
+INT,STRING,STRING
+====
+---- QUERY
+set utf8_mode=true;
+select id, name, btrim(name, '!?。,:;‘’“”≠≥≤∞ε∑∫√') from utf8_str_tiny;
+---- RESULTS: RAW_STRING
+1,'张三','张三'
+2,'李四','李四'
+3,'王五','王五'
+4,'李小龙','李小龙'
+5,'Alice','Alice'
+6,'陈Bob','陈Bob'
+7,'Бopиc','Бopиc'
+8,'Jörg','Jörg'
+9,'ひなた','ひなた'
+10,'서연','서연'
+---- TYPES
+INT,STRING,STRING
+====