This is an automated email from the ASF dual-hosted git repository. adonisling pushed a commit to branch clucene in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push: new b4e4f8f [enhancement](tokenizer) Accelerate ascii tokenizer speed by SIMD to_lower function (#50) b4e4f8f is described below commit b4e4f8f4b8f3407057fa7061a83dbfa16071642b Author: airborne12 <airborn...@gmail.com> AuthorDate: Mon Apr 17 15:41:32 2023 +0800 [enhancement](tokenizer) Accelerate ascii tokenizer speed by SIMD to_lower function (#50) --- src/core/CLucene/analysis/Analyzers.cpp | 46 ++++++++++++++++++++++++++++ src/core/CLucene/analysis/Analyzers.h | 6 ++++ src/core/CLucene/util/stringUtil.cpp | 1 + src/core/CLucene/util/stringUtil.h | 53 ++++++++++++++++++++++++++++++++- src/test/analysis/TestAnalysis.cpp | 2 ++ 5 files changed, 107 insertions(+), 1 deletion(-) diff --git a/src/core/CLucene/analysis/Analyzers.cpp b/src/core/CLucene/analysis/Analyzers.cpp index 6ca4183..c5c9b6a 100644 --- a/src/core/CLucene/analysis/Analyzers.cpp +++ b/src/core/CLucene/analysis/Analyzers.cpp @@ -13,6 +13,52 @@ CL_NS_USE(util) CL_NS_DEF(analysis) +template<> +void CharTokenizer<char>::normalize(const char *src, int64_t len, char *dst) { + to_lower((const uint8_t *) src, len, (uint8_t *) dst); +} + +template<> +Token *CharTokenizer<char>::next(Token *token) { + int32_t length = 0; + int32_t start = offset; + while (true) { + char c; + offset++; + if (bufferIndex >= dataLen) { + dataLen = input->read((const void **) &ioBuffer, 1, LUCENE_IO_BUFFER_SIZE); + if (dataLen == -1) + dataLen = 0; + bufferIndex = 0; + } + if (dataLen <= 0) { + if (length > 0) + break; + else + return NULL; + } else + c = ioBuffer[bufferIndex++]; + if (isTokenChar(c)) {// if it's a token TCHAR + + if (length == 0)// start of token + start = offset - 1; + + //buffer[length++] = normalize(c); // buffer it, normalized + buffer[length++] = c; + if (length == LUCENE_MAX_WORD_LEN)// buffer overflow! + break; + + } else if (length > 0)// at non-Letter w/ chars + break; // return 'em + } + char buffer_copy[LUCENE_MAX_WORD_LEN + 1]; + normalize(buffer, length, buffer_copy); + buffer_copy[length] = 0; + token->set(buffer_copy, start, start + length); + + return token; +}; + template<typename T> LetterTokenizer<T>::LetterTokenizer(CL_NS(util)::Reader* in): CharTokenizer<T>(in) { diff --git a/src/core/CLucene/analysis/Analyzers.h b/src/core/CLucene/analysis/Analyzers.h index fc65204..6ab819e 100644 --- a/src/core/CLucene/analysis/Analyzers.h +++ b/src/core/CLucene/analysis/Analyzers.h @@ -10,6 +10,7 @@ #include "CLucene/util/VoidList.h" #include "CLucene/util/VoidMap.h" #include "CLucene/util/CLStreams.h" +#include "CLucene/util/stringUtil.h" #include "AnalysisHeader.h" CL_NS_DEF(analysis) @@ -33,6 +34,11 @@ protected: * to, e.g., lowercase tokens. */ virtual T normalize(const T c) const{return c;}; + virtual void normalize(const T *src, int64_t len, T *dst) { + for (; src < src + len; ++src, ++dst) + *dst = normalize(*src); + }; + public: explicit CharTokenizer(CL_NS(util)::Reader* in):Tokenizer(in), offset(0), diff --git a/src/core/CLucene/util/stringUtil.cpp b/src/core/CLucene/util/stringUtil.cpp index 68437f5..07aa155 100644 --- a/src/core/CLucene/util/stringUtil.cpp +++ b/src/core/CLucene/util/stringUtil.cpp @@ -1,6 +1,7 @@ // // Created by 姜凯 on 2022/9/20. // +#include "CLucene/_ApiHeader.h" #include "stringUtil.h" template <> diff --git a/src/core/CLucene/util/stringUtil.h b/src/core/CLucene/util/stringUtil.h index eeddce7..f6a4958 100644 --- a/src/core/CLucene/util/stringUtil.h +++ b/src/core/CLucene/util/stringUtil.h @@ -5,7 +5,11 @@ #ifndef _lucene_util__stringutil_H #define _lucene_util__stringutil_H -#include "CLucene/_ApiHeader.h" +#ifdef __SSE2__ +#include <emmintrin.h> +#elif __aarch64__ +#include <sse2neon.h> +#endif template <typename T> const T* LUCENE_BLANK_SSTRING(); @@ -24,4 +28,51 @@ T *strDuplicate(const T *str); template<typename T> size_t lenOfString(const T *str); + +template <char not_case_lower_bound, char not_case_upper_bound> +class LowerUpperImpl { +public: + static void transfer(const uint8_t* src, const uint8_t* src_end, uint8_t* dst) { + const auto flip_case_mask = 'A' ^ 'a'; + +#if defined(__SSE2__) || defined(__aarch64__) + const auto bytes_sse = sizeof(__m128i); + const auto src_end_sse = src_end - (src_end - src) % bytes_sse; + + const auto v_not_case_lower_bound = _mm_set1_epi8(not_case_lower_bound - 1); + const auto v_not_case_upper_bound = _mm_set1_epi8(not_case_upper_bound + 1); + const auto v_flip_case_mask = _mm_set1_epi8(flip_case_mask); + + for (; src < src_end_sse; src += bytes_sse, dst += bytes_sse) { + const auto chars = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)); + const auto is_not_case = _mm_and_si128(_mm_cmpgt_epi8(chars, v_not_case_lower_bound), + _mm_cmplt_epi8(chars, v_not_case_upper_bound)); + const auto xor_mask = _mm_and_si128(v_flip_case_mask, is_not_case); + const auto cased_chars = _mm_xor_si128(chars, xor_mask); + _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), cased_chars); + } +#endif + + for (; src < src_end; ++src, ++dst) + if (*src >= not_case_lower_bound && *src <= not_case_upper_bound) + *dst = *src ^ flip_case_mask; + else + *dst = *src; + } +}; + +static void to_lower(const uint8_t* src, int64_t len, uint8_t* dst) { + if (len <= 0) { + return; + } + LowerUpperImpl<'A', 'Z'>::transfer(src, src + len, dst); +} + +static void to_upper(const uint8_t* src, int64_t len, uint8_t* dst) { + if (len <= 0) { + return; + } + LowerUpperImpl<'a', 'z'> lowerUpper; + LowerUpperImpl<'a', 'z'>::transfer(src, src + len, dst); +} #endif//_lucene_util__stringutil_H diff --git a/src/test/analysis/TestAnalysis.cpp b/src/test/analysis/TestAnalysis.cpp index 73e4337..da51aa0 100644 --- a/src/test/analysis/TestAnalysis.cpp +++ b/src/test/analysis/TestAnalysis.cpp @@ -78,6 +78,7 @@ void testTokenStreamField(CuTest *tc) { void testChar(CuTest *tc) { const char *text = "This is a test 123_test"; + std::vector<string> result{"this","is","a","test","123","test"}; SStringReader<char> reader(text, strlen(text)); SimpleAnalyzer<char> analyzer; TokenStream *stream = analyzer.tokenStream(NULL, &reader); @@ -85,6 +86,7 @@ void testChar(CuTest *tc) { int32_t count = 0; CL_NS(analysis)::Token t; while (stream->next(&t) != NULL) { + assertEquals(true, strCompare(t.termBuffer<char>(), result.at(count).c_str()) == 0); count++; } //printf("count = %d\n", count); --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org