Copilot commented on code in PR #64216:
URL: https://github.com/apache/doris/pull/64216#discussion_r3371813244
##########
be/src/storage/index/ann/ann_index_writer.h:
##########
@@ -75,17 +67,21 @@ class AnnIndexColumnWriter : public IndexColumnWriter {
Status finish() override;
private:
+ Status _build_and_save(Int64 min_train_rows, Int64 effective_min_rows);
+
+#ifdef BE_TEST
+ friend class TestAnnIndexColumnWriter;
+#endif
+
// VectorIndex shoule be managed by some cache.
Review Comment:
Spelling: "shoule" -> "should".
##########
be/src/storage/index/ann/ann_index_writer.cpp:
##########
@@ -110,26 +112,10 @@ Status AnnIndexColumnWriter::add_array_values(size_t
field_size, const void* val
const float* p = reinterpret_cast<const float*>(value_ptr);
- const size_t full_elements = AnnIndexColumnWriter::chunk_size() * dim;
- size_t remaining_elements = num_rows * dim;
- size_t src_offset = 0;
- while (remaining_elements > 0) {
- size_t available_space = full_elements - _float_array.size();
- size_t elements_to_add = std::min(remaining_elements, available_space);
-
- _float_array.insert(_float_array.end(), p + src_offset, p + src_offset
+ elements_to_add);
- src_offset += elements_to_add;
- remaining_elements -= elements_to_add;
-
- if (_float_array.size() == full_elements) {
- RETURN_IF_ERROR(
- _vector_index->train(AnnIndexColumnWriter::chunk_size(),
_float_array.data()));
- RETURN_IF_ERROR(
- _vector_index->add(AnnIndexColumnWriter::chunk_size(),
_float_array.data()));
- _float_array.clear();
- _need_save_index = true;
- }
- }
+ // The offsets check above guarantees every array row matches the ANN
index dimension.
+ DCHECK(p != nullptr);
+ _buffered_vectors.insert(_buffered_vectors.end(), p, p + num_rows * dim);
+ _total_rows += cast_set<int64_t>(num_rows);
Review Comment:
`add_array_values()` now buffers *all* vectors into `_buffered_vectors`
regardless of index type, and only calls `VectorIndex::add()` in `finish()`.
For non-train indexes (`get_min_train_rows() == 0`, e.g. HNSW/FLAT), this
changes peak memory from being bounded by incremental ingestion to being
proportional to full segment size, which can be a significant operational
regression on large segments. Consider streaming `add()` for non-train indexes
(or buffering only until `effective_min_rows` is reached, then switching to
streaming) so large segments don't require a full in-memory copy of all vectors.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]