xiaokang commented on code in PR #245:
URL: https://github.com/apache/doris-thirdparty/pull/245#discussion_r1827420549


##########
src/core/CLucene/index/FieldConfig.h:
##########
@@ -0,0 +1,11 @@
+#pragma once
+
+#include <cstdint>
+
+enum class FlagBits : uint32_t {
+    DICT_COMPRESS = 1 << 0, // 00000000 00000000 00000000 00000001
+};
+
+static bool isFlagSet(uint32_t flags, FlagBits flag) {
+    return flags & static_cast<uint32_t>(flag);

Review Comment:
   Why static_cast since flags is already uint32_t?



##########
src/core/CLucene/index/IndexWriter.h:
##########
@@ -291,7 +293,7 @@ class CLUCENE_EXPORT IndexWriter:LUCENE_BASE {
   // IndexOutput to the new Prox File
   std::vector<CL_NS(store)::IndexOutput*> proxOutputList;
   std::vector<int64_t> proxPointers;
-  std::vector<TermInfosWriter*> termInfosWriterList;
+  std::vector<STermInfosWriter<char>*> termInfosWriterList;

Review Comment:
   Why change it?



##########
src/core/CLucene/index/FieldInfos.cpp:
##########
@@ -114,6 +118,17 @@ IndexVersion FieldInfos::getIndexVersion() {
        return IndexVersion::kV0;
 }
 
+uint32_t FieldInfos::getFlags() {
+       size_t numFields = byNumber.size();
+       for (int i = 0; i < numFields; i++) {
+               FieldInfo* fi = fieldInfo(i);
+               if (fi->getFlags() > 0) {
+                       return fi->getFlags();

Review Comment:
   Is it right to just return the first non-zero flag?



##########
src/core/CLucene/index/TermInfosWriter.cpp:
##########
@@ -122,19 +134,32 @@ void STermInfosWriter<T>::add(int32_t fieldNumber, const 
T *termText, int32_t te
     CND_PRECONDITION(ti->proxPointer >= lastTi->proxPointer, ("proxPointer out 
of order (" + Misc::toString(ti->proxPointer) + " < " + 
Misc::toString(lastTi->proxPointer) + ")").c_str());
 
     if (!isIndex && size % indexInterval == 0) {
+        if (isDictCompress_) {
+            tisMemoryOutput_.writeCompressedTo(output);
+        }
         //add an index term
         other->add(lastFieldNumber, lastTermText.values, lastTermTextLength, 
lastTi);// add an index term
     }
 
-    //write term
-    writeTerm(fieldNumber, termText, termTextLength);
-    // write doc freq
-    output->writeVInt(ti->docFreq);
-    //write pointers
-    output->writeVLong(ti->freqPointer - lastTi->freqPointer);
-    output->writeVLong(ti->proxPointer - lastTi->proxPointer);
-    if (ti->docFreq >= skipInterval) {
-        output->writeVInt(ti->skipOffset);
+    if (isDictCompress_ && !isIndex) {

Review Comment:
   add comment for different branches.



##########
src/core/CLucene/index/FieldInfos.cpp:
##########
@@ -257,6 +274,9 @@ void FieldInfos::write(IndexOutput* output) const{
                if (fi->getIndexVersion() > IndexVersion::kV0) {
                        
output->writeVInt(static_cast<int32_t>(fi->getIndexVersion()));
                }
+               if (fi->getIndexVersion() >= IndexVersion::kV3) {

Review Comment:
   add test for index version compatibility.



##########
src/core/CLucene/index/IndexWriter.cpp:
##########
@@ -1879,7 +1884,8 @@ void IndexWriter::mergeTerms(bool hasProx, IndexVersion 
indexVersion) {
             TermInfo termInfo;
             termInfo.set(dfs[i], freqPointer, proxPointer, (int32_t) 
(skipPointer - freqPointer));
             // Write a new TermInfo
-            termInfosWriter->add(smallestTerm, &termInfo);
+            std::string cur_term = 
lucene_wcstoutf8string(smallestTerm->text(), smallestTerm->textLength());
+            termInfosWriter->add(smallestTerm->field(), cur_term.data(), 
cur_term.length(), &termInfo);

Review Comment:
   What's the effect of the change here and why change it?



##########
src/core/CLucene/store/v2/GrowableByteArrayDataOutput.h:
##########
@@ -0,0 +1,108 @@
+#pragma once
+
+#include <cstdint>
+#include <string_view>
+#include <vector>
+#include <zstd.h>
+#include <iostream>
+
+#include "CLucene.h"
+#include "CLucene/store/IndexOutput.h"
+
+namespace v2 {

Review Comment:
   The namespace name v2 is too simple and not precise.



##########
src/core/CLucene/document/Field.h:
##########
@@ -315,6 +316,24 @@ class CLUCENE_EXPORT Field : public 
CL_NS(util)::NamedObject{
        void setIndexVersion(IndexVersion indexVersion) { indexVersion_ = 
indexVersion; }
        IndexVersion getIndexVersion() { return indexVersion_; }
 
+       void updateFlag(FlagBits flag) {
+               if (indexVersion_ < IndexVersion::kV3) {
+                       return;
+               }
+               flags_ |= static_cast<uint32_t>(flag);
+  }

Review Comment:
   use consistent indent char: tab



##########
src/core/CLucene/index/_SegmentTermEnum.h:
##########
@@ -67,6 +68,7 @@ class SegmentTermEnum:public TermEnum{
         * Moves the current of the set to the next in the set
         */
        bool next();
+       bool _next(CL_NS(store)::IndexInput* in);

Review Comment:
   make it private.



##########
src/core/CLucene/index/IndexWriter.cpp:
##########
@@ -1879,7 +1884,8 @@ void IndexWriter::mergeTerms(bool hasProx, IndexVersion 
indexVersion) {
             TermInfo termInfo;
             termInfo.set(dfs[i], freqPointer, proxPointer, (int32_t) 
(skipPointer - freqPointer));
             // Write a new TermInfo
-            termInfosWriter->add(smallestTerm, &termInfo);
+            std::string cur_term = 
lucene_wcstoutf8string(smallestTerm->text(), smallestTerm->textLength());
+            termInfosWriter->add(smallestTerm->field(), cur_term.data(), 
cur_term.length(), &termInfo);

Review Comment:
   I dig into your newly added function add() but can not find the difference.



##########
src/core/CLucene/index/IndexVersion.h:
##########
@@ -1,9 +1,10 @@
 #pragma once
 
 enum class IndexVersion {
-  kV0 = 0,
-  kV1 = 1,
-  kV2 = 2,
+    kV0 = 0,
+    kV1 = 1,
+    kV2 = 2,
+    kV3 = 3,

Review Comment:
   add comment for each version



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: dev-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: dev-unsubscr...@doris.apache.org
For additional commands, e-mail: dev-h...@doris.apache.org

Reply via email to