This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push:
     new a24fa95a  [Fix] fix compile and unit test problems (#100)
a24fa95a is described below

commit a24fa95aa8935c980e3040dda8f948bf3a1b73a3
Author: airborne12 <airborn...@gmail.com>
AuthorDate: Wed Jul 12 15:58:48 2023 +0800

    [Fix] fix compile and unit test problems (#100)

    1. Fix CMake when building the CLucene test alone.
    2. Revise and add more Chinese unit tests.
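For reference, the Default-mode jieba tokenization pattern exercised by the new testSimpleJiebaDefaultModeTokenizer2 case is condensed into the sketch below. It is illustrative only and not part of the committed diff: it assumes the includes and the get_dict_path() helper already used by src/test/contribs-lib/analysis/testChinese.cpp, and printDefaultModeTokens is a hypothetical name. The expected terms noted in the loop comment mirror the assertions added by this commit.

    // Illustrative sketch only (not in this commit): walk the Default-mode
    // jieba tokens the same way the new test does and print each term as UTF-8.
    // Assumes the headers and the get_dict_path() helper from testChinese.cpp.
    void printDefaultModeTokens() {
        LanguageBasedAnalyzer a;
        const char* field_value_data = "中国的科技发展在世界上处于领先";
        auto stringReader = _CLNEW lucene::util::SStringReader<char>(
                field_value_data, strlen(field_value_data), false);

        a.setLanguage(_T("chinese"));
        a.setStem(false);
        a.setMode(lucene::analysis::AnalyzerMode::Default);  // Default mode, not All/Search
        a.initDict(get_dict_path());                         // load the jieba dictionary first

        TokenStream* ts = a.tokenStream(_T("contents"), stringReader);
        Token t;
        char tmp[255] = {};
        while (ts->next(&t) != nullptr) {
            // Expected terms: 中国 科技 发展 在世界上 处于 领先
            lucene_wcstoutf8(tmp, t.termBuffer<TCHAR>(), 254);
            std::cout << tmp << std::endl;
        }
        _CLDELETE(ts);
    }

The same setup with AnalyzerMode::All or AnalyzerMode::Search corresponds to the other jieba cases touched in testChinese.cpp.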
---
 src/contribs-lib/CMakeLists.txt                | 12 +++----
 src/test/contribs-lib/analysis/testChinese.cpp | 48 ++++++++++++++++++++------
 2 files changed, 44 insertions(+), 16 deletions(-)

diff --git a/src/contribs-lib/CMakeLists.txt b/src/contribs-lib/CMakeLists.txt
index df959fee..afc752e8 100644
--- a/src/contribs-lib/CMakeLists.txt
+++ b/src/contribs-lib/CMakeLists.txt
@@ -88,12 +88,12 @@ ENDIF()
 file(GLOB_RECURSE HEADERS ${clucene-contribs-lib_SOURCE_DIR}/*.h)
 
 #add extra capabilities
-find_package(ZLIB)
-IF ( NOT ZLIB_FOUND )
-    MESSAGE ( FATAL "ZLib not found" )
-ENDIF ( NOT ZLIB_FOUND )
-INCLUDE_DIRECTORIES( ${ZLIB_INCLUDE_DIR} )
-SET ( clucene_contrib_extra_libs "${clucene_contrib_extra_libs}" ${ZLIB_LIBRARIES} )
+#find_package(ZLIB)
+#IF ( NOT ZLIB_FOUND )
+#    MESSAGE ( FATAL "ZLib not found" )
+#ENDIF ( NOT ZLIB_FOUND )
+#INCLUDE_DIRECTORIES( ${ZLIB_INCLUDE_DIR} )
+#SET ( clucene_contrib_extra_libs "${clucene_contrib_extra_libs}" ${ZLIB_LIBRARIES} )
 
 find_package(Iconv)
 #find_package(Strigi)
diff --git a/src/test/contribs-lib/analysis/testChinese.cpp b/src/test/contribs-lib/analysis/testChinese.cpp
index 95f0c24b..7e47aa2d 100644
--- a/src/test/contribs-lib/analysis/testChinese.cpp
+++ b/src/test/contribs-lib/analysis/testChinese.cpp
@@ -212,8 +212,6 @@ void testSimpleJiebaAllModeTokenizer(CuTest* tc) {
     a.initDict(get_dict_path());
     ts = a.tokenStream(_T("contents"), stringReader);
 
-    CLUCENE_ASSERT(ts->next(&t) != NULL);
-    CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("我")) == 0);
     CLUCENE_ASSERT(ts->next(&t) != NULL);
     CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("来到")) == 0);
     CLUCENE_ASSERT(ts->next(&t) != NULL);
@@ -230,6 +228,43 @@ void testSimpleJiebaAllModeTokenizer(CuTest* tc) {
     _CLDELETE(ts);
 }
 
+void testSimpleJiebaDefaultModeTokenizer2(CuTest* tc) {
+    LanguageBasedAnalyzer a;
+    const char* field_value_data = "中国的科技发展在世界上处于领先";
+    auto stringReader =
+            _CLNEW lucene::util::SStringReader<char>(field_value_data, strlen(field_value_data), false);
+    TokenStream* ts;
+    Token t;
+
+    //test with chinese
+    a.setLanguage(_T("chinese"));
+    a.setStem(false);
+    a.setMode(lucene::analysis::AnalyzerMode::Default);
+    a.initDict(get_dict_path());
+    ts = a.tokenStream(_T("contents"), stringReader);
+
+    /*char tmp[255] = {};
+    while(ts->next(&t) != nullptr) {
+        lucene_wcstoutf8(tmp, t.termBuffer<TCHAR>(), 254);
+        std::cout << tmp << std::endl;
+    }*/
+
+    CLUCENE_ASSERT(ts->next(&t) != NULL);
+    CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("中国")) == 0);
+    CLUCENE_ASSERT(ts->next(&t) != NULL);
+    CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("科技")) == 0);
+    CLUCENE_ASSERT(ts->next(&t) != NULL);
+    CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("发展")) == 0);
+    CLUCENE_ASSERT(ts->next(&t) != NULL);
+    CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("在世界上")) == 0);
+    CLUCENE_ASSERT(ts->next(&t) != NULL);
+    CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("处于")) == 0);
+    CLUCENE_ASSERT(ts->next(&t) != NULL);
+    CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("领先")) == 0);
+    CLUCENE_ASSERT(ts->next(&t) == NULL);
+    _CLDELETE(ts);
+}
+
 void testSimpleJiebaDefaultModeTokenizer(CuTest* tc) {
     LanguageBasedAnalyzer a;
     const char* field_value_data = "我来到北京清华大学";
@@ -245,8 +280,6 @@ void testSimpleJiebaDefaultModeTokenizer(CuTest* tc) {
     a.initDict(get_dict_path());
     ts = a.tokenStream(_T("contents"), stringReader);
 
-    CLUCENE_ASSERT(ts->next(&t) != NULL);
-    CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("我")) == 0);
     CLUCENE_ASSERT(ts->next(&t) != NULL);
     CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("来到")) == 0);
     CLUCENE_ASSERT(ts->next(&t) != NULL);
@@ -330,14 +363,10 @@ void testSimpleJiebaTokenizer2(CuTest* tc) {
     CLUCENE_ASSERT(ts->next(&t) != NULL);
     CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("人民")) == 0);
     CLUCENE_ASSERT(ts->next(&t) != NULL);
-    CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("可以")) == 0);
-    CLUCENE_ASSERT(ts->next(&t) != NULL);
     CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("得到")) == 0);
     CLUCENE_ASSERT(ts->next(&t) != NULL);
     CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("更")) == 0);
     CLUCENE_ASSERT(ts->next(&t) != NULL);
-    CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("多")) == 0);
-    CLUCENE_ASSERT(ts->next(&t) != NULL);
     CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("实惠")) == 0);
     CLUCENE_ASSERT(ts->next(&t) == NULL);
     _CLDELETE(ts);
@@ -380,8 +409,6 @@ void testSimpleJiebaTokenizer4(CuTest* tc) {
     CLUCENE_ASSERT(ts->next(&t) != NULL);
     CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("人民")) == 0);
     CLUCENE_ASSERT(ts->next(&t) != NULL);
-    CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T(",")) == 0);
-    CLUCENE_ASSERT(ts->next(&t) != NULL);
     CLUCENE_ASSERT(_tcscmp(t.termBuffer<TCHAR>(), _T("银行")) == 0);
     CLUCENE_ASSERT(ts->next(&t) == NULL);
     _CLDELETE(ts);
@@ -1280,6 +1307,7 @@ CuSuite *testchinese(void) {
     SUITE_ADD_TEST(suite, testJiebaMatchHuge);
     SUITE_ADD_TEST(suite, testSimpleJiebaAllModeTokenizer);
     SUITE_ADD_TEST(suite, testSimpleJiebaDefaultModeTokenizer);
+    SUITE_ADD_TEST(suite, testSimpleJiebaDefaultModeTokenizer2);
     SUITE_ADD_TEST(suite, testSimpleJiebaSearchModeTokenizer);
     SUITE_ADD_TEST(suite, testSimpleJiebaAllModeTokenizer2);
     SUITE_ADD_TEST(suite, testSimpleJiebaSearchModeTokenizer2);

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org