[doris-thirdparty] branch clucene updated: [enhancement](tokenizer) Accelerate ascii tokenizer speed by SIMD to_lower function (#50)

adonisling Mon, 17 Apr 2023 00:42:22 -0700

This is an automated email from the ASF dual-hosted git repository.

adonisling pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git



The following commit(s) were added to refs/heads/clucene by this push:
     new b4e4f8f  [enhancement](tokenizer) Accelerate ascii tokenizer speed by SIMD to_lower function (#50)
b4e4f8f is described below

commit b4e4f8f4b8f3407057fa7061a83dbfa16071642b
Author: airborne12 <airborn...@gmail.com>
AuthorDate: Mon Apr 17 15:41:32 2023 +0800

    [enhancement](tokenizer) Accelerate ascii tokenizer speed by SIMD to_lower function (#50)
---
 src/core/CLucene/analysis/Analyzers.cpp | 46 ++++++++++++++++++++++++++++
 src/core/CLucene/analysis/Analyzers.h   |  6 ++++
 src/core/CLucene/util/stringUtil.cpp    |  1 +
 src/core/CLucene/util/stringUtil.h      | 53 ++++++++++++++++++++++++++++++++-
 src/test/analysis/TestAnalysis.cpp      |  2 ++
 5 files changed, 107 insertions(+), 1 deletion(-)

diff --git a/src/core/CLucene/analysis/Analyzers.cpp b/src/core/CLucene/analysis/Analyzers.cpp
index 6ca4183..c5c9b6a 100644
--- a/src/core/CLucene/analysis/Analyzers.cpp
+++ b/src/core/CLucene/analysis/Analyzers.cpp
@@ -13,6 +13,52 @@
 CL_NS_USE(util)
 CL_NS_DEF(analysis)
 
+// Specialization for char: hands the whole [src, src + len) range to
+// to_lower() in a single call and stores the result in dst, instead of
+// normalizing one character at a time.
+template<>
+void CharTokenizer<char>::normalize(const char *src, int64_t len, char *dst) {
+    const uint8_t *in = reinterpret_cast<const uint8_t *>(src);
+    uint8_t *out = reinterpret_cast<uint8_t *>(dst);
+    to_lower(in, len, out);
+}
+
+// Specialization for char: reads the next run of token characters from the
+// input, normalizes it in one bulk call and stores it in `token`.
+//
+// Returns `token` on success, or NULL when the input is exhausted before any
+// token character has been collected. The token's offsets are `start` (the
+// position of its first character) and `start + length`.
+template<>
+Token *CharTokenizer<char>::next(Token *token) {
+    int32_t length = 0;
+    int32_t start = offset;
+    while (true) {
+        offset++;
+        if (bufferIndex >= dataLen) {
+            // Current chunk is used up: fetch the next one. A -1 result from
+            // read() is treated the same as "no data".
+            dataLen = input->read((const void **) &ioBuffer, 1, LUCENE_IO_BUFFER_SIZE);
+            if (dataLen == -1)
+                dataLen = 0;
+            bufferIndex = 0;
+        }
+        if (dataLen <= 0) {
+            // Nothing left to read: emit a pending token, otherwise stop.
+            if (length > 0)
+                break;
+            return NULL;
+        }
+        const char c = ioBuffer[bufferIndex++];
+        if (isTokenChar(c)) {
+            if (length == 0)// start of token
+                start = offset - 1;
+
+            // Buffered raw; normalization happens in bulk after the loop.
+            buffer[length++] = c;
+            if (length == LUCENE_MAX_WORD_LEN)// buffer is full, cut the token here
+                break;
+        } else if (length > 0)// non-token char after token chars
+            break;            // return 'em
+    }
+    // One extra slot for the terminating 0.
+    char buffer_copy[LUCENE_MAX_WORD_LEN + 1];
+    normalize(buffer, length, buffer_copy);
+    buffer_copy[length] = 0;
+    token->set(buffer_copy, start, start + length);
+
+    return token;
+}
+
 template<typename T>
 LetterTokenizer<T>::LetterTokenizer(CL_NS(util)::Reader* in):
     CharTokenizer<T>(in) {
diff --git a/src/core/CLucene/analysis/Analyzers.h b/src/core/CLucene/analysis/Analyzers.h
index fc65204..6ab819e 100644
--- a/src/core/CLucene/analysis/Analyzers.h
+++ b/src/core/CLucene/analysis/Analyzers.h
@@ -10,6 +10,7 @@
 #include "CLucene/util/VoidList.h"
 #include "CLucene/util/VoidMap.h"
 #include "CLucene/util/CLStreams.h"
+#include "CLucene/util/stringUtil.h"
 #include "AnalysisHeader.h"
 
 CL_NS_DEF(analysis)
@@ -33,6 +34,11 @@ protected:
     * to, e.g., lowercase tokens. */
     virtual T normalize(const T c) const{return c;};
 
+    /** Bulk variant: writes normalize(src[i]) to dst[i] for each of the
+    * first len characters of src. A len of zero or less writes nothing.
+    * The end pointer is computed once up front; testing the moving src
+    * against src + len would never become false for a positive len. */
+    virtual void normalize(const T *src, int64_t len, T *dst) {
+        for (const T *const end = src + len; src < end; ++src, ++dst)
+            *dst = normalize(*src);
+    }
+
 public:
     explicit CharTokenizer(CL_NS(util)::Reader* in):Tokenizer(in),
                                                offset(0),
diff --git a/src/core/CLucene/util/stringUtil.cpp b/src/core/CLucene/util/stringUtil.cpp
index 68437f5..07aa155 100644
--- a/src/core/CLucene/util/stringUtil.cpp
+++ b/src/core/CLucene/util/stringUtil.cpp
@@ -1,6 +1,7 @@
 //
 // Created by 姜凯 on 2022/9/20.
 //
+#include "CLucene/_ApiHeader.h"
 #include "stringUtil.h"
 
 template <>
diff --git a/src/core/CLucene/util/stringUtil.h b/src/core/CLucene/util/stringUtil.h
index eeddce7..f6a4958 100644
--- a/src/core/CLucene/util/stringUtil.h
+++ b/src/core/CLucene/util/stringUtil.h
@@ -5,7 +5,11 @@
 #ifndef _lucene_util__stringutil_H
 #define _lucene_util__stringutil_H
 
-#include "CLucene/_ApiHeader.h"
+#ifdef __SSE2__
+#include <emmintrin.h>
+#elif __aarch64__
+#include <sse2neon.h>
+#endif
 
 template <typename T>
 const T* LUCENE_BLANK_SSTRING();
@@ -24,4 +28,51 @@ T *strDuplicate(const T *str);
 
 template<typename T>
 size_t lenOfString(const T *str);
+
+/// Copies [src, src_end) to dst, toggling the ASCII case bit of every byte
+/// inside the inclusive range [not_case_lower_bound, not_case_upper_bound].
+/// Bytes outside that range are copied unchanged.
+template <char not_case_lower_bound, char not_case_upper_bound>
+class LowerUpperImpl {
+public:
+    static void transfer(const uint8_t* src, const uint8_t* src_end, uint8_t* dst) {
+        // 'A' and 'a' differ in a single bit; XOR with it switches the case.
+        const auto flip_case_mask = 'A' ^ 'a';
+
+#if defined(__SSE2__) || defined(__aarch64__)
+        // Vector path: process as many whole __m128i-sized chunks as fit.
+        const auto chunk = sizeof(__m128i);
+        const auto vec_end = src_end - (src_end - src) % chunk;
+
+        const auto below_range = _mm_set1_epi8(not_case_lower_bound - 1);
+        const auto above_range = _mm_set1_epi8(not_case_upper_bound + 1);
+        const auto flip_bits = _mm_set1_epi8(flip_case_mask);
+
+        while (src < vec_end) {
+            const auto in = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
+            const auto gt_low = _mm_cmpgt_epi8(in, below_range);
+            const auto lt_high = _mm_cmplt_epi8(in, above_range);
+            // Keep the flip bit only in lanes that are inside the range.
+            const auto toggle = _mm_and_si128(flip_bits, _mm_and_si128(gt_low, lt_high));
+            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), _mm_xor_si128(in, toggle));
+            src += chunk;
+            dst += chunk;
+        }
+#endif
+
+        // Scalar path: the leftover tail, or everything when SIMD is unavailable.
+        while (src < src_end) {
+            const uint8_t byte = *src;
+            const bool in_range = byte >= not_case_lower_bound && byte <= not_case_upper_bound;
+            *dst = in_range ? (byte ^ flip_case_mask) : byte;
+            ++src;
+            ++dst;
+        }
+    }
+};
+
+/// Writes src[0, len) to dst with the bytes 'A'..'Z' converted to lower case
+/// via LowerUpperImpl. Does nothing when len is zero or negative.
+static void to_lower(const uint8_t* src, int64_t len, uint8_t* dst) {
+    if (len > 0) {
+        LowerUpperImpl<'A', 'Z'>::transfer(src, src + len, dst);
+    }
+}
+
+/// Writes src[0, len) to dst with the bytes 'a'..'z' converted to upper case
+/// via LowerUpperImpl. Does nothing when len is zero or negative.
+static void to_upper(const uint8_t* src, int64_t len, uint8_t* dst) {
+    if (len <= 0) {
+        return;
+    }
+    // transfer() is static, so no LowerUpperImpl instance is needed.
+    LowerUpperImpl<'a', 'z'>::transfer(src, src + len, dst);
+}
 #endif//_lucene_util__stringutil_H
diff --git a/src/test/analysis/TestAnalysis.cpp b/src/test/analysis/TestAnalysis.cpp
index 73e4337..da51aa0 100644
--- a/src/test/analysis/TestAnalysis.cpp
+++ b/src/test/analysis/TestAnalysis.cpp
@@ -78,6 +78,7 @@ void testTokenStreamField(CuTest *tc) {
 
 void testChar(CuTest *tc) {
     const char *text = "This is a test 123_test";
+    std::vector<string> result{"this","is","a","test","123","test"};
     SStringReader<char> reader(text, strlen(text));
     SimpleAnalyzer<char> analyzer;
     TokenStream *stream = analyzer.tokenStream(NULL, &reader);
@@ -85,6 +86,7 @@ void testChar(CuTest *tc) {
     int32_t count = 0;
     CL_NS(analysis)::Token t;
     while (stream->next(&t) != NULL) {
+        assertEquals(true, strCompare(t.termBuffer<char>(), 
result.at(count).c_str()) == 0);
         count++;
     }
     //printf("count = %d\n", count);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

[doris-thirdparty] branch clucene updated: [enhancement](tokenizer) Accelerate ascii tokenizer speed by SIMD to_lower function (#50)

Reply via email to