zzzxl1993 commented on code in PR #245: URL: https://github.com/apache/doris-thirdparty/pull/245#discussion_r1829009376
########## src/test/index/TestIndexCompressV3.cpp: ########## @@ -0,0 +1,325 @@ +#include <CLucene.h> // IWYU pragma: keep +#include <CLucene/index/IndexReader.h> +#include <CLucene/search/query/TermPositionIterator.h> +#include <CLucene/util/stringUtil.h> + +#include <ctime> +#include <exception> +#include <stdexcept> +#include <string> +#include <vector> + +#include "CLucene/analysis/Analyzers.h" +#include "CLucene/index/FieldConfig.h" +#include "CLucene/index/IndexVersion.h" +#include "CLucene/index/Term.h" +#include "CLucene/store/FSDirectory.h" +#include "CLucene/store/_RAMDirectory.h" +#include "CLucene/store/v2/ByteArrayDataInput.h" +#include "CLucene/store/v2/GrowableByteArrayDataOutput.h" +#include "CuTest.h" +#include "test.h" + +CL_NS_USE(search) +CL_NS_USE(store) +CL_NS_USE(index) +CL_NS_USE(util) + +static constexpr int32_t doc_count = 10000; + +#define FINALLY(eptr, finallyBlock) \ + { \ + finallyBlock; \ + if (eptr) { \ + std::rethrow_exception(eptr); \ + } \ + } + +static int32_t getDaySeed() { + std::time_t now = std::time(nullptr); + std::tm* localTime = std::localtime(&now); + localTime->tm_sec = 0; + localTime->tm_min = 0; + localTime->tm_hour = 0; + return static_cast<int32_t>(std::mktime(localTime) / (60 * 60 * 24)); +} + +static std::string generateRandomIP() { + std::string ip_v4; + ip_v4.append(std::to_string(rand() % 256)); + ip_v4.append("."); + ip_v4.append(std::to_string(rand() % 256)); + ip_v4.append("."); + ip_v4.append(std::to_string(rand() % 256)); + ip_v4.append("."); + ip_v4.append(std::to_string(rand() % 256)); + return ip_v4; +} + +static void write_index(const std::string& name, RAMDirectory* dir, IndexVersion index_version, + bool isDictCompress, const std::vector<std::string>& datas) { + auto* analyzer = _CLNEW lucene::analysis::SimpleAnalyzer<char>; + analyzer->set_stopwords(nullptr); + auto* indexwriter = _CLNEW lucene::index::IndexWriter(dir, analyzer, true); + indexwriter->setRAMBufferSizeMB(512); + indexwriter->setMaxBufferedDocs(-1); + indexwriter->setMaxFieldLength(0x7FFFFFFFL); + indexwriter->setMergeFactor(1000000000); + indexwriter->setUseCompoundFile(false); + + auto* char_string_reader = _CLNEW lucene::util::SStringReader<char>; + + auto* doc = _CLNEW lucene::document::Document(); + int32_t field_config = lucene::document::Field::STORE_NO; + field_config |= lucene::document::Field::INDEX_NONORMS; + field_config |= lucene::document::Field::INDEX_TOKENIZED; + auto field_name = std::wstring(name.begin(), name.end()); + auto* field = _CLNEW lucene::document::Field(field_name.c_str(), field_config); + field->setOmitTermFreqAndPositions(false); + field->setIndexVersion(index_version); + if (isDictCompress) { + field->updateFlag(FlagBits::DICT_COMPRESS); + } + doc->add(*field); + + for (const auto& data : datas) { + char_string_reader->init(data.data(), data.size(), false); + auto* stream = analyzer->reusableTokenStream(field->name(), char_string_reader); + field->setValue(stream); + indexwriter->addDocument(doc); + } + + indexwriter->close(); + + _CLLDELETE(indexwriter); + _CLLDELETE(doc); + _CLLDELETE(analyzer); + _CLLDELETE(char_string_reader); +} + +static void read_index(RAMDirectory* dir, int32_t doc_count) { + auto* reader = IndexReader::open(dir); + + std::exception_ptr eptr; + try { + if (doc_count != reader->numDocs()) { + std::string msg = "doc_count: " + std::to_string(doc_count) + + ", numDocs: " + std::to_string(reader->numDocs()); + _CLTHROWA(CL_ERR_IllegalArgument, msg.c_str()); + } + + Term* term = nullptr; + TermEnum* enumerator = nullptr; + try { + enumerator = reader->terms(); + while (enumerator->next()) { + term = enumerator->term(); + + auto* term_pos = reader->termPositions(term); + + std::exception_ptr eptr; + try { + TermPositionIterator iter(term_pos); + int32_t doc = 0; + while ((doc = iter.nextDoc()) != INT32_MAX) { + for (int32_t i = 0; i < iter.freq(); i++) { + int32_t pos = iter.nextPosition(); + if (pos < 0 || pos > 3) { + std::string msg = "pos: " + std::to_string(pos); + _CLTHROWA(CL_ERR_IllegalArgument, msg.c_str()); + } + } + } + } catch (...) { + eptr = std::current_exception(); + } + FINALLY(eptr, { _CLDELETE(term_pos); }) + + _CLDECDELETE(term); + } + } + _CLFINALLY({ + _CLDECDELETE(term); + enumerator->close(); + _CLDELETE(enumerator); + }) + + } catch (...) { + eptr = std::current_exception(); + } + FINALLY(eptr, { + reader->close(); + _CLLDELETE(reader); + }) +} + +static void index_compaction(RAMDirectory* tmp_dir, std::vector<lucene::store::Directory*> srcDirs, + std::vector<lucene::store::Directory*> destDirs, int32_t count) { + auto* analyzer = _CLNEW lucene::analysis::SimpleAnalyzer<char>; + auto* indexwriter = _CLNEW lucene::index::IndexWriter(tmp_dir, analyzer, true); + + std::vector<std::vector<std::pair<uint32_t, uint32_t>>> trans_vec( + srcDirs.size(), std::vector<std::pair<uint32_t, uint32_t>>(count)); + int32_t idx = 0; + int32_t id = 0; + for (int32_t i = 0; i < count; i++) { + for (int32_t j = 0; j < srcDirs.size(); j++) { + if (id == count * destDirs.size()) { + idx++; + id = 0; + } + trans_vec[j][i] = std::make_pair(idx, id++); + } + } + + std::vector<uint32_t> dest_index_docs(destDirs.size()); + for (int32_t i = 0; i < destDirs.size(); i++) { + dest_index_docs[i] = count * destDirs.size(); + } + + std::exception_ptr eptr; + try { + indexwriter->indexCompaction(srcDirs, destDirs, trans_vec, dest_index_docs); + } catch (...) { + eptr = std::current_exception(); + } + FINALLY(eptr, { + indexwriter->close(); + _CLDELETE(indexwriter); + _CLDELETE(analyzer); + }) +} + +void TestIndexByteArray(CuTest* tc) { + RAMDirectory dir; + auto ram_out = dir.createOutput("TestIndexByteArray"); + + v2::GrowableByteArrayDataOutput out; + for (int32_t i = 0; i < doc_count; i++) { + out.writeVInt(i); + } + out.writeCompressedTo(ram_out); + ram_out->close(); + + IndexInput* ram_in = nullptr; + CLuceneError error; + bool ret = dir.openInput("TestIndexByteArray", ram_in, error); + if (!ret) { + std::cout << error.what() << std::endl; + } + assertTrue(ret); + + v2::ByteArrayDataInput in; + in.readCompressedFrom(ram_in); + for (int32_t i = 0; i < doc_count; i++) { + assertEquals(in.readVInt(), i); + } + + _CLDELETE(ram_out); + _CLDELETE(ram_in); + + std::cout << "\nTestIndexByteArray sucess" << std::endl; +} + +void TestIndexCompressV3(CuTest* tc) { + std::srand(getDaySeed()); + + std::string name = "v2_field_name"; + std::vector<std::string> datas; + for (int32_t i = 0; i < doc_count; i++) { + std::string ip_v4 = generateRandomIP(); + datas.emplace_back(ip_v4); + } + + RAMDirectory dir; + write_index(name, &dir, IndexVersion::kV3, false, datas); + + try { + read_index(&dir, doc_count); Review Comment: In the read_index function, the position range is checked, and an exception is thrown if there is an error. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: dev-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@doris.apache.org For additional commands, e-mail: dev-h...@doris.apache.org