yiguolei commented on code in PR #64082:
URL: https://github.com/apache/doris/pull/64082#discussion_r3360231777


##########
be/src/storage/index/ann/ann_index_writer.cpp:
##########
@@ -146,54 +129,56 @@ int64_t AnnIndexColumnWriter::size() const {
 }
 
 Status AnnIndexColumnWriter::finish() {
-    Int64 min_train_rows = _vector_index->get_min_train_rows();
-
-    // Check if we have enough rows to train the index
-    // train/add the remaining data
-    if (_float_array.empty()) {
-        if (_need_save_index) {
-            return _vector_index->save(_dir.get());
-        } else {
-            // No data was added at all. This can happen if the segment has 0 rows
-            // or all rows were filtered out. We need to delete the directory entry
-            // to avoid writing an empty/invalid index file.
-            LOG_INFO("No data to train/add for ANN index. Skipping index building.");
-            return _index_file_writer->delete_index(_index_meta);
-        }
-    } else {
-        DCHECK(_float_array.size() % _vector_index->get_dimension() == 0);
-
-        Int64 num_rows = _float_array.size() / _vector_index->get_dimension();
-
-        if (num_rows >= min_train_rows) {
-            RETURN_IF_ERROR(_vector_index->train(num_rows, _float_array.data()));
-            RETURN_IF_ERROR(_vector_index->add(num_rows, _float_array.data()));
-            _float_array.clear();
-            return _vector_index->save(_dir.get());
-        } else {
-            // It happens to have not enough data to train.
-            // If we have data to add before, we still need to save the index.
-            if (_need_save_index) {
-                // For IVF indexes, adding remaining vectors without training is acceptable
-                // because the quantizer was already trained on previous batches. These vectors
-                // are simply added to the nearest clusters without retraining.
-                RETURN_IF_ERROR(_vector_index->add(num_rows, _float_array.data()));
-                _float_array.clear();
-                return _vector_index->save(_dir.get());
-            } else {
-                // Not enough data to train and no data added before.
-                // Means this is a very small segment, we can skip the index building.
-                // We need to delete the directory entry from index_file_writer to avoid
-                // writing an empty/invalid index file which causes "IndexInput read past EOF" error.
-                LOG_INFO(
-                        "Remaining data size {} is less than minimum {} rows required for ANN "
-                        "index "
-                        "training. Skipping index building for this segment.",
-                        num_rows, min_train_rows);
-                _float_array.clear();
-                return _index_file_writer->delete_index(_index_meta);
-            }
-        }
+    if (_total_rows == 0) {
+        LOG_INFO("No data to train/add for ANN index. Skipping index building.");
+        return _index_file_writer->delete_index(_index_meta);
+    }
+
+    const Int64 min_train_rows = _vector_index->get_min_train_rows();
+    const Int64 effective_min_rows = _effective_min_rows(min_train_rows);
+    if (_total_rows < effective_min_rows) {
+        LOG_INFO(
+                "Total data size {} is less than minimum {} rows required for ANN index build. "
+                "Skipping index building for this segment.",
+                _total_rows, effective_min_rows);
+        _release_buffered_vectors();
+        return _index_file_writer->delete_index(_index_meta);
+    }
+
+    return _build_and_save(min_train_rows, effective_min_rows);
+}
+
+Int64 AnnIndexColumnWriter::_effective_min_rows(Int64 min_train_rows) const {
+    return std::max(min_train_rows, cast_set<Int64>(_min_segment_rows));
+}
+
+Status AnnIndexColumnWriter::_append_vectors_to_buffer(const float* vectors, size_t num_rows) {
+    DCHECK(vectors != nullptr);

Review Comment:
   之前这里有一个校验 dim 长度的检查(`DCHECK(_float_array.size() % _vector_index->get_dimension() == 0)`),你这里没有了?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to