This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch vector-index-dev
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/vector-index-dev by this push:
new d6048f27a55 [fix] fix unstable test case (#51159)
d6048f27a55 is described below
commit d6048f27a556b3b5a2a164a679fed2bc7ee4536f
Author: zhiqiang <[email protected]>
AuthorDate: Thu May 22 17:44:35 2025 +0800
[fix] fix unstable test case (#51159)
Search result of diff faiss index object is not same if we use batch
insert mode.
So i refactored the test case to make sure we can compare result of
native faiss and vector index of doris
---
be/src/olap/rowset/segment_v2/ann_index_reader.cpp | 4 +-
be/src/vec/exprs/vann_topn_predicate.cpp | 5 +-
.../olap/vector_search/ann_index_reader_test.cpp | 2 +-
.../olap/vector_search/ann_index_smoke_test.cpp | 2 +-
.../olap/vector_search/ann_range_search_test.cpp | 14 +-
.../vector_search/ann_topn_descriptor_test.cpp | 42 +-
.../olap/vector_search/faiss_vector_index_test.cpp | 540 +++++++++------------
be/test/olap/vector_search/native_faiss_test.cpp | 53 +-
be/test/olap/vector_search/vector_search_utils.cpp | 249 ++++++++++
be/test/olap/vector_search/vector_search_utils.h | 74 ++-
10 files changed, 653 insertions(+), 332 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/ann_index_reader.cpp
b/be/src/olap/rowset/segment_v2/ann_index_reader.cpp
index b1fcdee341f..0222597ea32 100644
--- a/be/src/olap/rowset/segment_v2/ann_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/ann_index_reader.cpp
@@ -39,9 +39,7 @@ void AnnIndexReader::update_result(const IndexSearchResult&
search_result,
for (size_t i = 0; i < limit; ++i) {
distance[i] = search_result.distances[i];
}
- // Use the search result's row_ids to update the input row_id
- // by calculating the union in-place
- roaring &= *search_result.roaring;
+ roaring = *search_result.roaring;
}
AnnIndexReader::AnnIndexReader(const TabletIndex* index_meta,
diff --git a/be/src/vec/exprs/vann_topn_predicate.cpp
b/be/src/vec/exprs/vann_topn_predicate.cpp
index 41098e43a07..d68086dcd3d 100644
--- a/be/src/vec/exprs/vann_topn_predicate.cpp
+++ b/be/src/vec/exprs/vann_topn_predicate.cpp
@@ -147,8 +147,9 @@ Status AnnTopNDescriptor::evaluate_vector_ann_search(
result_column = ColumnFloat64::create();
ColumnFloat64* result_column_float =
assert_cast<ColumnFloat64*>(result_column.get());
- result_column_float->resize(ann_query_params.distance->size());
- for (size_t i = 0; i < ann_query_params.distance->size(); ++i) {
+ size_t num_results = ann_query_params.distance->size();
+ result_column_float->resize(num_results);
+ for (size_t i = 0; i < num_results; ++i) {
result_column_float->get_data()[i] = (*ann_query_params.distance)[i];
}
row_ids = std::move(ann_query_params.row_ids);
diff --git a/be/test/olap/vector_search/ann_index_reader_test.cpp
b/be/test/olap/vector_search/ann_index_reader_test.cpp
index b384d0c6671..e951a48c743 100644
--- a/be/test/olap/vector_search/ann_index_reader_test.cpp
+++ b/be/test/olap/vector_search/ann_index_reader_test.cpp
@@ -26,7 +26,7 @@
namespace doris::segment_v2 {
-using namespace vec_search_mock;
+using namespace vector_search_utils;
class AnnIndexReaderTest : public testing::Test {};
TEST_F(AnnIndexReaderTest, TestLoadIndex) {
diff --git a/be/test/olap/vector_search/ann_index_smoke_test.cpp
b/be/test/olap/vector_search/ann_index_smoke_test.cpp
index 03d1d543a27..b106a9bc15e 100644
--- a/be/test/olap/vector_search/ann_index_smoke_test.cpp
+++ b/be/test/olap/vector_search/ann_index_smoke_test.cpp
@@ -31,7 +31,7 @@
#include "vector_index.h"
#include "vector_search_utils.h"
-using namespace doris::vec_search_mock;
+using namespace doris::vector_search_utils;
namespace doris {
diff --git a/be/test/olap/vector_search/ann_range_search_test.cpp
b/be/test/olap/vector_search/ann_range_search_test.cpp
index 248ae3d4fd7..8b9ad847f46 100644
--- a/be/test/olap/vector_search/ann_range_search_test.cpp
+++ b/be/test/olap/vector_search/ann_range_search_test.cpp
@@ -886,14 +886,15 @@ TEST_F(VectorSearchTest, TestEvaluateAnnRangeSearch) {
idx_to_cid[3] = 3;
std::vector<std::unique_ptr<segment_v2::IndexIterator>>
cid_to_index_iterators;
cid_to_index_iterators.resize(4);
- cid_to_index_iterators[1] =
std::make_unique<doris::vec_search_mock::MockAnnIndexIterator>();
+ cid_to_index_iterators[1] =
+
std::make_unique<doris::vector_search_utils::MockAnnIndexIterator>();
std::vector<std::unique_ptr<segment_v2::ColumnIterator>> column_iterators;
column_iterators.resize(4);
column_iterators[3] =
std::make_unique<doris::segment_v2::VirtualColumnIterator>();
roaring::Roaring row_bitmap;
- doris::vec_search_mock::MockAnnIndexIterator* mock_ann_index_iter =
- dynamic_cast<doris::vec_search_mock::MockAnnIndexIterator*>(
+ doris::vector_search_utils::MockAnnIndexIterator* mock_ann_index_iter =
+ dynamic_cast<doris::vector_search_utils::MockAnnIndexIterator*>(
cid_to_index_iterators[1].get());
// Explain:
@@ -978,14 +979,15 @@ TEST_F(VectorSearchTest, TestEvaluateAnnRangeSearch2) {
idx_to_cid[3] = 3;
std::vector<std::unique_ptr<segment_v2::IndexIterator>>
cid_to_index_iterators;
cid_to_index_iterators.resize(4);
- cid_to_index_iterators[1] =
std::make_unique<doris::vec_search_mock::MockAnnIndexIterator>();
+ cid_to_index_iterators[1] =
+
std::make_unique<doris::vector_search_utils::MockAnnIndexIterator>();
std::vector<std::unique_ptr<segment_v2::ColumnIterator>> column_iterators;
column_iterators.resize(4);
column_iterators[3] =
std::make_unique<doris::segment_v2::VirtualColumnIterator>();
roaring::Roaring row_bitmap;
- doris::vec_search_mock::MockAnnIndexIterator* mock_ann_index_iter =
- dynamic_cast<doris::vec_search_mock::MockAnnIndexIterator*>(
+ doris::vector_search_utils::MockAnnIndexIterator* mock_ann_index_iter =
+ dynamic_cast<doris::vector_search_utils::MockAnnIndexIterator*>(
cid_to_index_iterators[1].get());
// Explain:
diff --git a/be/test/olap/vector_search/ann_topn_descriptor_test.cpp
b/be/test/olap/vector_search/ann_topn_descriptor_test.cpp
index b08adf73c32..7437bd80411 100644
--- a/be/test/olap/vector_search/ann_topn_descriptor_test.cpp
+++ b/be/test/olap/vector_search/ann_topn_descriptor_test.cpp
@@ -116,11 +116,40 @@ TEST_F(VectorSearchTest, AnnTopNDescriptorEvaluateTopN) {
ASSERT_TRUE(st.ok()) << fmt::format("st: {}, expr {}", st.to_string(),
predicate->get_order_by_expr_ctx()->root()->debug_string());
- std::shared_ptr<std::vector<float>> query_value =
std::make_shared<std::vector<float>>(10, 0.0);
+ const ColumnConst* const_column =
+ assert_cast<const ColumnConst*>(predicate->_query_array.get());
+ const ColumnArray* column_array =
+ assert_cast<const
ColumnArray*>(const_column->get_data_column_ptr().get());
+ const ColumnNullable* column_nullable =
+ assert_cast<const
ColumnNullable*>(column_array->get_data_ptr().get());
+ const ColumnFloat64* cf64 =
+ assert_cast<const
ColumnFloat64*>(column_nullable->get_nested_column_ptr().get());
+
+ const double* query_value = cf64->get_data().data();
+ const size_t query_value_size = cf64->get_data().size();
+ ASSERT_EQ(query_value_size, 8);
+ std::vector<float> query_value_f32;
+ for (size_t i = 0; i < query_value_size; ++i) {
+ query_value_f32.push_back(static_cast<float>(query_value[i]));
+ }
+ ASSERT_FLOAT_EQ(query_value_f32[0], 1.0f) << "query_value_f32[0] = " <<
query_value_f32[0];
+ ASSERT_FLOAT_EQ(query_value_f32[1], 2.0f) << "query_value_f32[1] = " <<
query_value_f32[1];
+ ASSERT_FLOAT_EQ(query_value_f32[2], 3.0f) << "query_value_f32[2] = " <<
query_value_f32[2];
+ ASSERT_FLOAT_EQ(query_value_f32[3], 4.0f) << "query_value_f32[3] = " <<
query_value_f32[3];
+ ASSERT_FLOAT_EQ(query_value_f32[4], 5.0f) << "query_value_f32[4] = " <<
query_value_f32[4];
+ ASSERT_FLOAT_EQ(query_value_f32[5], 6.0f) << "query_value_f32[5] = " <<
query_value_f32[5];
+ ASSERT_FLOAT_EQ(query_value_f32[6], 7.0f) << "query_value_f32[6] = " <<
query_value_f32[6];
+ ASSERT_FLOAT_EQ(query_value_f32[7], 20.0f) << "query_value_f32[7] = " <<
query_value_f32[7];
+
+ std::shared_ptr<std::vector<float>> query_vector =
+ std::make_shared<std::vector<float>>(10, 0.0);
for (size_t i = 0; i < 10; ++i) {
- (*query_value)[i] = static_cast<float>(i);
+ (*query_vector)[i] = static_cast<float>(i);
}
+ std::cout << "query_vector: " << fmt::format("[{}]",
fmt::join(*query_vector, ","))
+ << std::endl;
+
EXPECT_CALL(*_ann_index_iterator, read_from_index(testing::_))
.Times(1)
.WillOnce(testing::Invoke([](const segment_v2::IndexParam& value) {
@@ -136,17 +165,16 @@ TEST_F(VectorSearchTest, AnnTopNDescriptorEvaluateTopN) {
_result_column = ColumnFloat64::create(0, 0);
std::unique_ptr<std::vector<uint64_t>> row_ids =
std::make_unique<std::vector<uint64_t>>();
- // roaring is empry
+
roaring::Roaring roaring;
st = predicate->evaluate_vector_ann_search(_ann_index_iterator.get(),
roaring, _result_column,
row_ids);
ColumnFloat64* result_column_float =
assert_cast<ColumnFloat64*>(_result_column.get());
- for (size_t i = 0; i < query_value->size(); ++i) {
- EXPECT_EQ(result_column_float->get_data()[i], (*query_value)[i]);
+ for (size_t i = 0; i < query_vector->size(); ++i) {
+ EXPECT_EQ(result_column_float->get_data()[i], (*query_vector)[i]);
}
ASSERT_TRUE(st.ok());
- ASSERT_EQ(row_ids->size(), 0);
- ASSERT_EQ(row_ids->size(), roaring.cardinality());
+ ASSERT_EQ(row_ids->size(), 10);
}
} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/test/olap/vector_search/faiss_vector_index_test.cpp
b/be/test/olap/vector_search/faiss_vector_index_test.cpp
index 7c7b42611ec..bae49627ef2 100644
--- a/be/test/olap/vector_search/faiss_vector_index_test.cpp
+++ b/be/test/olap/vector_search/faiss_vector_index_test.cpp
@@ -31,109 +31,10 @@
#include "vector_index.h"
#include "vector_search_utils.h"
-// Generate random vectors for testing
-std::vector<float> generate_random_vector(int dim) {
- std::vector<float> vector(dim);
- std::random_device rd;
- std::mt19937 gen(rd());
- std::uniform_real_distribution<float> dis(-1.0f, 1.0f);
-
- for (int i = 0; i < dim; i++) {
- vector[i] = dis(gen);
- }
- return vector;
-}
-
using namespace doris::segment_v2;
namespace doris::vectorized {
-// Helper function to create and configure a Doris Faiss index
-static std::unique_ptr<FaissVectorIndex> create_doris_index(
- int dimension, int m,
- FaissBuildParameter::IndexType index_type =
FaissBuildParameter::IndexType::HNSW,
- FaissBuildParameter::Quantilizer quantilizer =
FaissBuildParameter::Quantilizer::FLAT) {
- auto index = std::make_unique<FaissVectorIndex>();
-
- FaissBuildParameter params;
- params.d = dimension;
- params.m = m;
- params.index_type = index_type;
- params.quantilizer = quantilizer;
- index->set_build_params(params);
-
- return index;
-}
-
-// Helper function to create a native Faiss HNSW index
-static std::unique_ptr<faiss::IndexHNSWFlat> create_native_index(int
dimension, int m) {
- return std::make_unique<faiss::IndexHNSWFlat>(dimension, m);
-}
-
-// Helper function to generate a batch of random vectors
-static std::vector<std::vector<float>> generate_test_vectors(int num_vectors,
int dimension) {
- std::vector<std::vector<float>> vectors;
- vectors.reserve(num_vectors);
-
- for (int i = 0; i < num_vectors; i++) {
- vectors.push_back(generate_random_vector(dimension));
- }
-
- return vectors;
-}
-
-// Helper function to add vectors to both Doris and native indexes
-static void add_vectors_to_indexes(FaissVectorIndex* doris_index,
- faiss::IndexHNSWFlat* native_index,
- const std::vector<std::vector<float>>&
vectors) {
- for (size_t i = 0; i < vectors.size(); i++) {
- auto status = doris_index->add(1, vectors[i].data());
- ASSERT_TRUE(status.ok()) << "Failed to add vector to Doris index: " <<
status.to_string();
-
- native_index->add(1, vectors[i].data());
- }
-}
-
-// Helper function to print search results for comparison
-[[maybe_unused]] static void print_search_results(const IndexSearchResult&
doris_results,
- const std::vector<float>&
native_distances,
- const
std::vector<faiss::idx_t>& native_indices,
- int query_idx) {
- std::cout << "Query vector index: " << query_idx << std::endl;
-
- std::cout << "Doris Index Results:" << std::endl;
- for (int i = 0; i < doris_results.roaring->cardinality(); i++) {
- std::cout << "ID: " << doris_results.roaring->getIndex(i)
- << ", Distance: " << doris_results.distances[i] << std::endl;
- }
-
- std::cout << "Native Faiss Results:" << std::endl;
- for (size_t i = 0; i < native_indices.size(); i++) {
- if (native_indices[i] == -1) continue;
- std::cout << "ID: " << native_indices[i] << ", Distance: " <<
native_distances[i]
- << std::endl;
- }
-}
-
-// Helper function to compare search results between Doris and native Faiss
-static void compare_search_results(const IndexSearchResult& doris_results,
- const std::vector<float>& native_distances,
- const std::vector<faiss::idx_t>&
native_indices,
- float epsilon = 1e-5) {
- EXPECT_EQ(doris_results.roaring->cardinality(),
- std::count_if(native_indices.begin(), native_indices.end(),
- [](faiss::idx_t id) { return id != -1; }));
-
- for (size_t i = 0; i < native_indices.size(); i++) {
- if (native_indices[i] == -1) continue;
-
- EXPECT_TRUE(doris_results.roaring->contains(native_indices[i]))
- << "ID mismatch at rank " << i;
- EXPECT_NEAR(doris_results.distances[i], native_distances[i], epsilon)
- << "Distance mismatch at rank " << i;
- }
-}
-
// Test saving and loading an index
TEST_F(VectorSearchTest, TestSaveAndLoad) {
// Step 1: Create first index instance
@@ -151,7 +52,7 @@ TEST_F(VectorSearchTest, TestSaveAndLoad) {
const int num_vectors = 100;
std::vector<float> vectors;
for (int i = 0; i < num_vectors; i++) {
- auto tmp = generate_random_vector(params.d);
+ auto tmp = vector_search_utils::generate_random_vector(params.d);
vectors.insert(vectors.end(), tmp.begin(), tmp.end());
}
@@ -169,7 +70,7 @@ TEST_F(VectorSearchTest, TestSaveAndLoad) {
ASSERT_TRUE(load_status.ok()) << "Failed to load index: " <<
load_status.to_string();
// Step 7: Verify the loaded index works by searching
- auto query_vec = generate_random_vector(params.d);
+ auto query_vec = vector_search_utils::generate_random_vector(params.d);
const int top_k = 10;
IndexSearchParameters search_params;
@@ -221,132 +122,173 @@ TEST_F(VectorSearchTest, UpdateRoaring) {
}
TEST_F(VectorSearchTest, CompareResultWithNativeFaiss1) {
- // Step 1: Create indexes
- const int dimension = 64;
- const int max_connections = 16;
- auto doris_index = create_doris_index(dimension, max_connections);
- auto native_index = create_native_index(dimension, max_connections);
-
- // Step 2: Generate vectors and add to indexes
- const int num_vectors = 200;
- auto vectors = generate_test_vectors(num_vectors, dimension);
- add_vectors_to_indexes(doris_index.get(), native_index.get(), vectors);
-
- // Step 3: Search
- int query_idx = num_vectors / 2;
- const float* query_vec = vectors[query_idx].data();
- const int top_k = 10;
-
- // Search in Doris index
- IndexSearchParameters search_params;
- IndexSearchResult doris_results;
- auto search_status =
- doris_index->ann_topn_search(query_vec, top_k, search_params,
doris_results);
- ASSERT_EQ(search_status.ok(), true);
-
- // Search in native Faiss index
- std::vector<float> native_distances(top_k);
- std::vector<faiss::idx_t> native_indices(top_k);
- native_index->search(1, query_vec, top_k, native_distances.data(),
native_indices.data());
-
- // Step 4: Print and compare results
- // print_search_results(doris_results, native_distances, native_indices,
query_idx);
- compare_search_results(doris_results, native_distances, native_indices);
+ const size_t iterations = 50;
+ // Create random number generator
+ std::random_device rd;
+ std::mt19937 gen(rd());
+ // Define fixed parameter sets to choose from
+ const std::vector<int> dimensions = {32, 64, 128, 256};
+ const std::vector<int> max_connections = {8, 16, 32, 64};
+ const std::vector<int> vector_counts = {100, 200, 500, 1000};
+
+ for (size_t iter = 0; iter < iterations; ++iter) {
+ // Randomly select parameters from the fixed sets
+ const int dimension =
+ dimensions[std::uniform_int_distribution<>(0,
dimensions.size() - 1)(gen)];
+ const int max_connection =
max_connections[std::uniform_int_distribution<>(
+ 0, max_connections.size() - 1)(gen)];
+ const int num_vectors =
+ vector_counts[std::uniform_int_distribution<>(0,
vector_counts.size() - 1)(gen)];
+
+ // Step 1: Create indexes
+ auto doris_index = doris::vector_search_utils::create_doris_index(
+ doris::vector_search_utils::IndexType::HNSW, dimension,
max_connection);
+ auto native_index = doris::vector_search_utils::create_native_index(
+ doris::vector_search_utils::IndexType::HNSW, dimension,
max_connection);
+
+ // Step 2: Generate vectors and add to indexes
+ auto vectors =
+
doris::vector_search_utils::generate_test_vectors_matrix(num_vectors,
dimension);
+
doris::vector_search_utils::add_vectors_to_indexes_serial_mode(doris_index.get(),
+
native_index.get(), vectors);
+
+ // Step 3: Search
+ int query_idx = num_vectors / 2;
+ const float* query_vec = vectors[query_idx].data();
+ const int top_k = 10;
+
+ // Search in Doris index
+ IndexSearchParameters search_params;
+ IndexSearchResult doris_results;
+ auto search_status =
+ doris_index->ann_topn_search(query_vec, top_k, search_params,
doris_results);
+ ASSERT_EQ(search_status.ok(), true)
+ << "Search failed with dimension=" << dimension
+ << ", max_connections=" << max_connection << ", num_vectors="
<< num_vectors;
+
+ // Search in native Faiss index
+ std::vector<float> native_distances(top_k);
+ std::vector<faiss::idx_t> native_indices(top_k);
+ native_index->search(1, query_vec, top_k, native_distances.data(),
native_indices.data());
+
+ // Step 4: Compare results
+ vector_search_utils::compare_search_results(doris_results,
native_distances,
+ native_indices);
+ }
}
TEST_F(VectorSearchTest, CompareResultWithNativeFaiss2) {
- // Step 1: Create indexes
- const int dimension = 64;
- const int max_connections = 16;
- auto doris_index = create_doris_index(dimension, max_connections);
- auto native_index = create_native_index(dimension, max_connections);
-
- // Step 2: Generate vectors and add to indexes
- const int num_vectors = 500;
- auto vectors = generate_test_vectors(num_vectors, dimension);
- add_vectors_to_indexes(doris_index.get(), native_index.get(), vectors);
-
- // Step 3: Search
- int query_idx = num_vectors / 2;
- const float* query_vec = vectors[query_idx].data();
- const int top_k = num_vectors;
-
- // Search in Doris index
- IndexSearchParameters search_params;
- IndexSearchResult doris_results;
- auto search_status =
- doris_index->ann_topn_search(query_vec, top_k, search_params,
doris_results);
- ASSERT_EQ(search_status.ok(), true);
-
- // Search in native Faiss index
- std::vector<float> native_distances(top_k, -1);
- std::vector<faiss::idx_t> native_indices(top_k, -1);
- native_index->search(1, query_vec, top_k, native_distances.data(),
native_indices.data());
-
- // Step 4: Print and compare results
- // print_search_results(doris_results, native_distances, native_indices,
query_idx);
- compare_search_results(doris_results, native_distances, native_indices);
+ const size_t iterations = 50;
+ // Create random number generator
+ std::random_device rd;
+ std::mt19937 gen(rd());
+ // Define fixed parameter sets to choose from
+ const std::vector<int> dimensions = {32, 64, 128, 256};
+ const std::vector<int> max_connections = {8, 16, 32, 64};
+ const std::vector<int> vector_counts = {100, 200, 500, 1000};
+
+ for (size_t i = 0; i < iterations; ++i) {
+ // Randomly select parameters from the fixed sets
+ const int dimension =
+ dimensions[std::uniform_int_distribution<>(0,
dimensions.size() - 1)(gen)];
+ const int max_connection =
max_connections[std::uniform_int_distribution<>(
+ 0, max_connections.size() - 1)(gen)];
+ const int num_vectors =
+ vector_counts[std::uniform_int_distribution<>(0,
vector_counts.size() - 1)(gen)];
+
+ // Step 1: Create indexes
+ auto doris_index = doris::vector_search_utils::create_doris_index(
+ doris::vector_search_utils::IndexType::HNSW, dimension,
max_connection);
+ auto native_index = doris::vector_search_utils::create_native_index(
+ doris::vector_search_utils::IndexType::HNSW, dimension,
max_connection);
+
+ // Step 2: Generate vectors and add to indexes
+ std::vector<std::vector<float>> vectors =
+
doris::vector_search_utils::generate_test_vectors_matrix(num_vectors,
dimension);
+
doris::vector_search_utils::add_vectors_to_indexes_serial_mode(doris_index.get(),
+
native_index.get(), vectors);
+
+ // Step 3: Search
+ int query_idx = num_vectors / 2;
+ const float* query_vec = vectors[query_idx].data();
+ const int top_k = num_vectors;
+ IndexSearchParameters search_params;
+ IndexSearchResult doris_results;
+ std::ignore = doris_index->ann_topn_search(query_vec, top_k,
search_params, doris_results);
+
+ // Search in native Faiss index
+ std::vector<float> native_distances(top_k, -1);
+ std::vector<faiss::idx_t> native_indices(top_k, -1);
+ native_index->search(1, query_vec, top_k, native_distances.data(),
native_indices.data());
+
+ // Step 4: Compare results
+ doris::vector_search_utils::compare_search_results(doris_results,
native_distances,
+ native_indices);
+ }
}
TEST_F(VectorSearchTest, SearchAllVectors) {
- // Step 1: Create and build index
- auto index1 = std::make_unique<FaissVectorIndex>();
+ size_t iterations = 25;
+ for (size_t i = 0; i < iterations; ++i) {
+ // Step 1: Create and build index
+ auto index1 = std::make_unique<FaissVectorIndex>();
- FaissBuildParameter params;
- params.d = 64;
- params.m = 32;
- params.index_type = FaissBuildParameter::IndexType::HNSW;
- params.quantilizer = FaissBuildParameter::Quantilizer::FLAT;
- index1->set_build_params(params);
+ FaissBuildParameter params;
+ params.d = 64;
+ params.m = 32;
+ params.index_type = FaissBuildParameter::IndexType::HNSW;
+ params.quantilizer = FaissBuildParameter::Quantilizer::FLAT;
+ index1->set_build_params(params);
- // Add 500 vectors
- const int num_vectors = 500;
- std::vector<float> vectors;
- for (int i = 0; i < num_vectors; i++) {
- auto vec = generate_random_vector(params.d);
- vectors.insert(vectors.end(), vec.begin(), vec.end());
- }
+ // Add 500 vectors
+ const int num_vectors = 500;
+ std::vector<float> vectors;
+ for (int i = 0; i < num_vectors; i++) {
+ auto vec =
doris::vector_search_utils::generate_random_vector(params.d);
+ vectors.insert(vectors.end(), vec.begin(), vec.end());
+ }
- ASSERT_EQ(index1->add(500, vectors.data()).ok(), true);
+ ASSERT_EQ(index1->add(500, vectors.data()).ok(), true);
- // Save index
- ASSERT_TRUE(index1->save(_ram_dir.get()).ok());
+ // Save index
+ ASSERT_TRUE(index1->save(_ram_dir.get()).ok());
- // Step 2: Load index
- auto index2 = std::make_unique<FaissVectorIndex>();
- ASSERT_TRUE(index2->load(_ram_dir.get()).ok());
+ // Step 2: Load index
+ auto index2 = std::make_unique<FaissVectorIndex>();
+ ASSERT_TRUE(index2->load(_ram_dir.get()).ok());
- // Step 3: Search all vectors
- IndexSearchParameters search_params;
- IndexSearchResult search_result;
-
- // Search for all vectors - use a vector we know is in the index
- std::vector<float> query_vec {vectors.begin(),
- vectors.begin() + params.d}; // Use the
first vector we added
- const int top_k = num_vectors; // Get all
vectors
-
- ASSERT_EQ(index2->ann_topn_search(query_vec.data(), top_k, search_params,
search_result).ok(),
- true);
- std::cout << "Search returned " << search_result.roaring->cardinality() <<
" results\n";
- // Step 4: Verify we got all vectors back
- // Note: In practical ANN search with approximate algorithms like HNSW,
- // we might not get exactly all vectors due to the nature of approximate
search.
- // So we verify we got a reasonable number back.
- EXPECT_GE(search_result.roaring->cardinality(), num_vectors * 0.60)
- << "Expected to find at least 60% of all vectors";
-
- // Also verify the first result is the query vector itself (it should be
an exact match)
- ASSERT_EQ(search_result.roaring->isEmpty(), false) << "Search result
should not be empty";
- size_t first = search_result.roaring->getIndex(0);
- std::vector<float> first_result_vec(vectors.begin() + first * params.d,
- vectors.begin() + (first + 1) *
params.d);
- std::string query_vec_str = fmt::format("[{}]", fmt::join(query_vec, ","));
- std::string first_result_vec_str = fmt::format("[{}]",
fmt::join(first_result_vec, ","));
- EXPECT_EQ(first_result_vec, query_vec) << "First result should be the
query vector itself";
+ // Step 3: Search all vectors
+ IndexSearchParameters search_params;
+ IndexSearchResult search_result;
+
+ // Search for all vectors - use a vector we know is in the index
+ std::vector<float> query_vec {vectors.begin(),
+ vectors.begin() + params.d}; // Use the
first vector we added
+ const int top_k = num_vectors; // Get all
vectors
+
+ ASSERT_EQ(
+ index2->ann_topn_search(query_vec.data(), top_k,
search_params, search_result).ok(),
+ true);
+ // Step 4: Verify we got all vectors back
+ // Note: In practical ANN search with approximate algorithms like HNSW,
+ // we might not get exactly all vectors due to the nature of
approximate search.
+ // So we verify we got a reasonable number back.
+ EXPECT_GE(search_result.roaring->cardinality(), num_vectors * 0.60)
+ << "Expected to find at least 60% of all vectors";
+
+ // Also verify the first result is the query vector itself (it should
be an exact match)
+ ASSERT_EQ(search_result.roaring->isEmpty(), false) << "Search result
should not be empty";
+ size_t first = search_result.roaring->getIndex(0);
+ std::vector<float> first_result_vec(vectors.begin() + first * params.d,
+ vectors.begin() + (first + 1) *
params.d);
+ std::string query_vec_str = fmt::format("[{}]", fmt::join(query_vec,
","));
+ std::string first_result_vec_str = fmt::format("[{}]",
fmt::join(first_result_vec, ","));
+ EXPECT_EQ(first_result_vec, query_vec) << "First result should be the
query vector itself";
+ }
}
TEST_F(VectorSearchTest, CompRangeSearch) {
- size_t iterations = 50;
+ size_t iterations = 25;
for (size_t i = 0; i < iterations; ++i) {
// Random parameters for each test iteration
std::random_device rd;
@@ -367,35 +309,36 @@ TEST_F(VectorSearchTest, CompRangeSearch) {
index1->set_build_params(params);
const int num_vectors = random_n;
- std::vector<float> vectors;
+ std::vector<std::vector<float>> vectors;
for (int i = 0; i < num_vectors; i++) {
- auto vec = generate_random_vector(params.d);
- vectors.insert(vectors.end(), vec.begin(), vec.end());
+ auto vec = vector_search_utils::generate_random_vector(params.d);
+ vectors.push_back(vec);
}
std::unique_ptr<faiss::Index> native_index =
std::make_unique<faiss::IndexHNSWFlat>(params.d, params.m);
- native_index->add(num_vectors, vectors.data());
- std::ignore = index1->add(num_vectors, vectors.data());
+
doris::vector_search_utils::add_vectors_to_indexes_serial_mode(index1.get(),
+
native_index.get(), vectors);
- std::vector<float> query_vec {vectors.begin(),
- vectors.begin() + params.d}; // Use the
first vector we added
+ std::vector<float> query_vec = vectors.front();
std::vector<std::pair<size_t, float>> distances(num_vectors);
for (int i = 0; i < num_vectors; i++) {
double sum = 0;
+ auto& vec = vectors[i];
for (int j = 0; j < params.d; j++) {
- accumulate(vectors[i * params.d + j], query_vec[j], sum);
+ accumulate(vec[j], query_vec[j], sum);
}
distances[i] = std::make_pair(i, finalize(sum));
}
std::sort(distances.begin(), distances.end(),
[](const auto& a, const auto& b) { return a.second <
b.second; });
- // Use the median distance as the radius
- float radius = distances[num_vectors / 2].second;
+
+ float radius = distances[num_vectors / 4].second;
HNSWSearchParameters hnsw_params;
hnsw_params.ef_search = 16; // Set efSearch for better accuracy
hnsw_params.roaring = nullptr; // No selector for this test
+ hnsw_params.is_le_or_lt = true;
IndexSearchResult doris_result;
std::ignore = index1->range_search(query_vec.data(), radius,
hnsw_params, doris_result);
@@ -452,30 +395,31 @@ TEST_F(VectorSearchTest, RangeSearchNoSelector1) {
index1->set_build_params(params);
const int num_vectors = random_n;
- std::vector<float> vectors;
+ std::vector<std::vector<float>> vectors;
for (int i = 0; i < num_vectors; i++) {
- auto vec = generate_random_vector(params.d);
- vectors.insert(vectors.end(), vec.begin(), vec.end());
+ auto vec = vector_search_utils::generate_random_vector(params.d);
+ vectors.push_back(vec);
}
+ std::unique_ptr<faiss::Index> native_index =
+ std::make_unique<faiss::IndexHNSWFlat>(params.d, params.m);
+
doris::vector_search_utils::add_vectors_to_indexes_serial_mode(index1.get(),
+
native_index.get(), vectors);
+
+ std::vector<float> query_vec = vectors.front();
std::vector<std::pair<size_t, float>> distances(num_vectors);
for (int i = 0; i < num_vectors; i++) {
double sum = 0;
+ auto& vec = vectors[i];
for (int j = 0; j < params.d; j++) {
- accumulate(vectors[i * params.d + j], vectors[j], sum);
+ accumulate(vec[j], query_vec[j], sum);
}
distances[i] = std::make_pair(i, finalize(sum));
}
std::sort(distances.begin(), distances.end(),
[](const auto& a, const auto& b) { return a.second <
b.second; });
- // Use the median distance as the radius
- float radius = distances[num_vectors / 2].second;
-
- std::unique_ptr<faiss::Index> native_index =
- std::make_unique<faiss::IndexHNSWFlat>(params.d, params.m,
faiss::METRIC_L2);
- native_index->add(num_vectors, vectors.data());
- std::ignore = index1->add(num_vectors, vectors.data());
+ float radius = distances[num_vectors / 4].second;
// Save index
ASSERT_TRUE(index1->save(_ram_dir.get()).ok());
@@ -485,8 +429,6 @@ TEST_F(VectorSearchTest, RangeSearchNoSelector1) {
// Step 3: Range search
// Use a vector we know is in the index
- std::vector<float> query_vec {vectors.begin(),
- vectors.begin() + params.d}; // Use the
first vector we added
faiss::SearchParametersHNSW search_params;
search_params.efSearch = 16; // Set efSearch for better accuracy
@@ -568,17 +510,20 @@ TEST_F(VectorSearchTest, RangeSearchWithSelector1) {
index1->set_build_params(params);
const int num_vectors = 1000;
- std::vector<float> vectors;
+ std::vector<std::vector<float>> vectors;
for (int i = 0; i < num_vectors; i++) {
- auto vec = generate_random_vector(params.d);
- vectors.insert(vectors.end(), vec.begin(), vec.end());
+ auto vec = vector_search_utils::generate_random_vector(params.d);
+ vectors.push_back(vec);
}
+ // Use a vector we know is in the index
+ std::vector<float> query_vec = vectors.front();
std::vector<std::pair<size_t, float>> distances(num_vectors);
for (int i = 0; i < num_vectors; i++) {
double sum = 0;
+ auto& vec = vectors[i];
for (int j = 0; j < params.d; j++) {
- accumulate(vectors[i * params.d + j], vectors[j], sum);
+ accumulate(vec[j], query_vec[j], sum);
}
distances[i] = std::make_pair(i, finalize(sum));
}
@@ -589,8 +534,8 @@ TEST_F(VectorSearchTest, RangeSearchWithSelector1) {
std::unique_ptr<faiss::Index> native_index =
std::make_unique<faiss::IndexHNSWFlat>(params.d, params.m,
faiss::METRIC_L2);
- native_index->add(num_vectors, vectors.data());
- std::ignore = index1->add(num_vectors, vectors.data());
+
doris::vector_search_utils::add_vectors_to_indexes_serial_mode(index1.get(),
+
native_index.get(), vectors);
std::unique_ptr<roaring::Roaring> all_rows =
std::make_unique<roaring::Roaring>();
std::unique_ptr<roaring::Roaring> sel_rows =
std::make_unique<roaring::Roaring>();
@@ -600,11 +545,8 @@ TEST_F(VectorSearchTest, RangeSearchWithSelector1) {
sel_rows->add(i);
}
}
- // Step 3: Range search
- // Use a vector we know is in the index
- std::vector<float> query_vec {vectors.begin(),
- vectors.begin() + params.d}; // Use the
first vector we added
+ // Step 3: Range search
faiss::SearchParametersHNSW search_params;
search_params.efSearch = 16; // Set efSearch for better accuracy
auto faiss_selector =
segment_v2::FaissVectorIndex::roaring_to_faiss_selector(*sel_rows);
@@ -612,7 +554,7 @@ TEST_F(VectorSearchTest, RangeSearchWithSelector1) {
faiss::RangeSearchResult native_search_result(1, true);
native_index->range_search(1, query_vec.data(), radius * radius,
&native_search_result,
&search_params);
-
+ // labels and distance
std::vector<std::pair<int, float>> native_results;
size_t begin = native_search_result.lims[0];
size_t end = native_search_result.lims[1];
@@ -622,7 +564,7 @@ TEST_F(VectorSearchTest, RangeSearchWithSelector1) {
}
HNSWSearchParameters doris_search_params;
- doris_search_params.ef_search = 16; // Set efSearch for better accuracy
+ doris_search_params.ef_search = search_params.efSearch;
doris_search_params.is_le_or_lt = true;
doris_search_params.roaring = sel_rows.get();
IndexSearchResult doris_search_result;
@@ -644,11 +586,17 @@ TEST_F(VectorSearchTest, RangeSearchWithSelector1) {
}
doris_search_params.is_le_or_lt = false;
- roaring::Roaring ge_rows = *doris_search_params.roaring;
+ IndexSearchResult doris_search_result2;
+ ASSERT_EQ(index1->range_search(query_vec.data(), radius,
doris_search_params,
+ doris_search_result2)
+ .ok(),
+ true);
+ roaring::Roaring ge_rows = *doris_search_result2.roaring;
roaring::Roaring less_rows;
for (size_t i = 0; i < native_results.size(); ++i) {
less_rows.add(native_results[i].first);
}
+ // result2 contains all rows that not included by result1
roaring::Roaring and_row_id = ge_rows & less_rows;
roaring::Roaring or_row_id = ge_rows | less_rows;
ASSERT_NEAR(and_row_id.cardinality(), 0, 1);
@@ -659,14 +607,12 @@ TEST_F(VectorSearchTest, RangeSearchWithSelector1) {
TEST_F(VectorSearchTest, RangeSearchEmptyResult) {
for (size_t i = 0; i < 10; ++i) {
- auto index1 = std::make_unique<FaissVectorIndex>();
- FaissBuildParameter params;
- params.d = 10;
- params.m = 32;
- params.index_type = FaissBuildParameter::IndexType::HNSW;
- index1->set_build_params(params);
-
+ const size_t d = 10;
+ const size_t m = 32;
const int num_vectors = 1000;
+ auto index1 =
+
vector_search_utils::create_doris_index(vector_search_utils::IndexType::HNSW,
d, m);
+
std::vector<float> vectors;
// Create 1000 vectors and make sure their l2_distance with
[1,2,3,4,5,6,7,8,9,10] is less than 100
// [1,2,3,4,5,6,7,8,9,10]
@@ -679,73 +625,66 @@ TEST_F(VectorSearchTest, RangeSearchEmptyResult) {
rowid = rowid % 10;
}
- std::vector<float> vec(params.d);
- for (int colid = 0; colid < params.d; colid++) {
+ std::vector<float> vec(d);
+ for (int colid = 0; colid < d; colid++) {
vec[colid] = (rowid + colid) % 10 + 1;
}
vectors.insert(vectors.end(), vec.begin(), vec.end());
}
- // Find the min
- float radius = 5.0f;
-
- std::unique_ptr<faiss::Index> native_index =
- std::make_unique<faiss::IndexHNSWFlat>(params.d, params.m,
faiss::METRIC_L2);
-
- native_index->add(num_vectors, vectors.data());
- std::ignore = index1->add(num_vectors, vectors.data());
+ std::unique_ptr<faiss::Index> native_index =
vector_search_utils::create_native_index(
+ vector_search_utils::IndexType::HNSW, d, m);
+ vector_search_utils::add_vectors_to_indexes_batch_mode(index1.get(),
native_index.get(),
+ num_vectors,
vectors);
std::vector<float> query_vec;
- for (int i = 0; i < params.d; i++) {
+ for (int i = 0; i < d; i++) {
query_vec.push_back(5.0f);
}
- // L2 distance between [5,5,5,5,5,5,5,5,5,5] with any vector add added
is large than 5 and less than 250.
- faiss::SearchParametersHNSW search_params;
- search_params.efSearch = 1000; // Set efSearch for better accuracy
- faiss::RangeSearchResult native_search_result(1, true);
- native_index->range_search(1, query_vec.data(), radius * radius,
&native_search_result);
+ // L2 distance between [5,5,5,5,5,5,5,5,5,5] with any other vector is
large than 5 and less than 250.
+ // Find the min
+ float radius = 5.0f;
+ doris::segment_v2::HNSWSearchParameters search_params;
+ search_params.ef_search = 1000; // Set efSearch for better accuracy
+ auto doris_search_result =
vector_search_utils::perform_doris_index_range_search(
+ index1.get(), query_vec.data(), radius, search_params);
+ auto native_search_result =
vector_search_utils::perform_native_index_range_search(
+ native_index.get(), query_vec.data(), radius);
- std::vector<std::pair<int, float>> native_results;
- size_t begin = native_search_result.lims[0];
- size_t end = native_search_result.lims[1];
- for (size_t i = begin; i < end; i++) {
- native_results.push_back(
- {native_search_result.labels[i],
native_search_result.distances[i]});
- }
+ ASSERT_EQ(doris_search_result->roaring->cardinality(), 0);
+ ASSERT_EQ(native_search_result.size(), 0);
- HNSWSearchParameters doris_search_params;
- doris_search_params.ef_search = 1000; // Set efSearch for better
accuracy
- doris_search_params.is_le_or_lt = true;
+ // Search all rows.
+ doris::segment_v2::HNSWSearchParameters search_params_all_rows;
+ search_params_all_rows.ef_search = 1000; // Set efSearch for better
accuracy
+ search_params_all_rows.is_le_or_lt = true;
std::unique_ptr<roaring::Roaring> sel_rows =
std::make_unique<roaring::Roaring>();
for (size_t i = 0; i < num_vectors; ++i) {
sel_rows->add(i);
}
+ search_params_all_rows.roaring = sel_rows.get();
+ doris_search_result =
vector_search_utils::perform_doris_index_range_search(
+ index1.get(), query_vec.data(), radius,
search_params_all_rows);
+ ASSERT_EQ(doris_search_result->distances != nullptr, true);
- // Search all rows.
- doris_search_params.roaring = sel_rows.get();
- IndexSearchResult doris_search_result;
-
- ASSERT_EQ(index1->range_search(query_vec.data(), radius,
doris_search_params,
- doris_search_result)
- .ok(),
- true);
+ native_search_result =
vector_search_utils::perform_native_index_range_search(
+ native_index.get(), query_vec.data(), radius);
- ASSERT_EQ(native_results.size(),
doris_search_result.roaring->cardinality());
- ASSERT_EQ(doris_search_result.distances != nullptr, true);
- for (size_t i = 0; i < native_results.size(); i++) {
- const size_t rowid = native_results[i].first;
- const float dis = native_results[i].second;
- ASSERT_EQ(doris_search_result.roaring->contains(rowid), true)
+ // Make sure result is same
+ for (size_t i = 0; i < native_search_result.size(); i++) {
+ const size_t rowid = native_search_result[i].first;
+ const float dis = native_search_result[i].second;
+ ASSERT_EQ(doris_search_result->roaring->contains(rowid), true)
<< "Row ID mismatch at rank " << i;
- ASSERT_FLOAT_EQ(doris_search_result.distances[i], sqrt(dis))
+ ASSERT_FLOAT_EQ(doris_search_result->distances[i], sqrt(dis))
<< "Distance mismatch at rank " << i;
}
- doris_search_params.is_le_or_lt = false;
- roaring::Roaring ge_rows = *doris_search_params.roaring;
+ search_params_all_rows.is_le_or_lt = false;
+ roaring::Roaring ge_rows = *search_params_all_rows.roaring;
roaring::Roaring less_rows;
- for (size_t i = 0; i < native_results.size(); ++i) {
- less_rows.add(native_results[i].first);
+ for (size_t i = 0; i < native_search_result.size(); ++i) {
+ less_rows.add(native_search_result[i].first);
}
roaring::Roaring and_row_id = ge_rows & less_rows;
roaring::Roaring or_row_id = ge_rows | less_rows;
@@ -762,4 +701,5 @@ TEST_F(VectorSearchTest, TestIdSelectorWithEmptyRoaring) {
ASSERT_EQ(sel->is_member(i), false) << "Selector should be empty";
}
}
+
} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/test/olap/vector_search/native_faiss_test.cpp
b/be/test/olap/vector_search/native_faiss_test.cpp
index dc7b5da3046..1cfdd12019f 100644
--- a/be/test/olap/vector_search/native_faiss_test.cpp
+++ b/be/test/olap/vector_search/native_faiss_test.cpp
@@ -25,9 +25,11 @@
#include <iomanip>
#include <roaring/roaring.hh>
-#include "vector/faiss_vector_index.h"
+#include "olap/vector_search/vector_search_utils.h"
+namespace doris::vector_search_utils {
std::vector<float> generate_random_vector(int dim);
+}
class NativeFaissTest : public ::testing::Test {
public:
@@ -233,7 +235,7 @@ TEST_F(NativeFaissTest, TestRangeSearch10000Random) {
int n = 1000;
std::vector<float> data_vectors;
for (int i = 0; i < n; ++i) {
- auto random_vector = generate_random_vector(dim);
+ auto random_vector =
doris::vector_search_utils::generate_random_vector(dim);
data_vectors.insert(data_vectors.end(), random_vector.begin(),
random_vector.end());
}
@@ -287,7 +289,7 @@ TEST_F(NativeFaissTest,
TestRangeSearchRandomVectorsSearchMedian) {
// Generate random vectors
std::vector<float> data_vectors;
for (int i = 0; i < n; ++i) {
- auto random_vector = generate_random_vector(dim);
+ auto random_vector =
doris::vector_search_utils::generate_random_vector(dim);
data_vectors.insert(data_vectors.end(),
random_vector.begin(),
random_vector.end());
}
@@ -367,13 +369,15 @@ TEST_F(NativeFaissTest, TestTopNRandomVectorSearchMedian)
{
// Generate random vectors
std::vector<float> data_vectors;
for (int i = 0; i < n; ++i) {
- auto random_vector = generate_random_vector(dim);
+ auto random_vector =
+
doris::vector_search_utils::generate_random_vector(dim);
data_vectors.insert(data_vectors.end(),
random_vector.begin(),
random_vector.end());
}
// Use a random vector as query
- std::vector<float> query_vector =
generate_random_vector(dim);
+ std::vector<float> query_vector =
+
doris::vector_search_utils::generate_random_vector(dim);
// Brute force search to find ground truth top-k
std::vector<std::pair<int, float>> all_distances(n);
@@ -436,7 +440,7 @@ TEST_F(NativeFaissTest, SameTypeIndexDiffObject) {
int n = 1000;
std::vector<float> data_vectors;
for (int i = 0; i < n; ++i) {
- auto random_vector = generate_random_vector(dim);
+ auto random_vector =
doris::vector_search_utils::generate_random_vector(dim);
data_vectors.insert(data_vectors.end(), random_vector.begin(),
random_vector.end());
}
@@ -475,4 +479,41 @@ TEST_F(NativeFaissTest, SameTypeIndexDiffObject) {
ASSERT_FLOAT_EQ(actual_results1[i].second, actual_results2[i].second);
ASSERT_FLOAT_EQ(actual_results1[i].second, actual_results3[i].second);
}
+}
+
+TEST_F(NativeFaissTest, BatchInsert) {
+ size_t iterations = 5;
+ for (size_t i = 0; i < iterations; ++i) {
+ const size_t d = 100;
+ const size_t m = 32;
+ const int num_vectors = 1000;
+
+ auto index1 = doris::vector_search_utils::create_native_index(
+ doris::vector_search_utils::IndexType::HNSW, d, m);
+
+ auto index2 = doris::vector_search_utils::create_native_index(
+ doris::vector_search_utils::IndexType::HNSW, d, m);
+
+ auto flatten_vectors =
+
doris::vector_search_utils::generate_test_vectors_flatten(num_vectors, d);
+
+ doris::vector_search_utils::add_vectors_to_indexes_batch_mode(nullptr,
index1.get(),
+
num_vectors, flatten_vectors);
+ doris::vector_search_utils::add_vectors_to_indexes_batch_mode(nullptr,
index2.get(),
+
num_vectors, flatten_vectors);
+
+ // Use a random vector as query
+ std::vector<float> query_vector =
doris::vector_search_utils::generate_random_vector(d);
+ float radius =
doris::vector_search_utils::get_radius_from_flatten(query_vector.data(), d,
+
flatten_vectors, 0.4);
+
+ // Get index-based results using HNSW index
+ auto res1 =
doris::vector_search_utils::perform_native_index_range_search(
+ index1.get(), query_vector.data(), radius);
+ auto res2 =
doris::vector_search_utils::perform_native_index_range_search(
+ index2.get(), query_vector.data(), radius);
+ // Insert vector using batch insert will make result is different.
+ std::cout << "res1 size: " << res1.size() << std::endl;
+ std::cout << "res2 size: " << res2.size() << std::endl;
+ }
}
\ No newline at end of file
diff --git a/be/test/olap/vector_search/vector_search_utils.cpp
b/be/test/olap/vector_search/vector_search_utils.cpp
new file mode 100644
index 00000000000..6ee03666bbf
--- /dev/null
+++ b/be/test/olap/vector_search/vector_search_utils.cpp
@@ -0,0 +1,249 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vector_search_utils.h"
+
+#include <faiss/IndexHNSW.h>
+
+#include <cstddef>
+#include <memory>
+
+#include "faiss_vector_index.h"
+#include "vector_index.h"
+
+namespace doris::vector_search_utils {
+static void accumulate(double x, double y, double& sum) {
+ sum += (x - y) * (x - y);
+}
+static double finalize(double sum) {
+ return sqrt(sum);
+}
+
+// Generate random vectors for testing
+std::vector<float> generate_random_vector(int dim) {
+ std::vector<float> vector(dim);
+ std::random_device rd;
+ std::mt19937 gen(rd());
+ std::uniform_real_distribution<float> dis(-1.0f, 1.0f);
+
+ for (int i = 0; i < dim; i++) {
+ vector[i] = dis(gen);
+ }
+ return vector;
+}
+
+// Helper function to create and configure a Doris Vector index
+std::unique_ptr<doris::segment_v2::VectorIndex> create_doris_index(IndexType
index_type,
+ int
dimension, int m) {
+ auto index = std::make_unique<doris::segment_v2::FaissVectorIndex>();
+ segment_v2::FaissBuildParameter params;
+ params.d = dimension;
+ params.m = m;
+ switch (index_type) {
+ case IndexType::FLAT_L2:
+ params.index_type =
segment_v2::FaissBuildParameter::IndexType::BruteForce;
+ break;
+ case IndexType::HNSW:
+ params.index_type = segment_v2::FaissBuildParameter::IndexType::HNSW;
+ break;
+ default:
+ throw std::invalid_argument("Unsupported index type");
+ }
+ params.quantilizer = segment_v2::FaissBuildParameter::Quantilizer::FLAT;
+ index->set_build_params(params);
+ return std::move(index);
+}
+
+// Helper function to create a native Faiss index
+std::unique_ptr<faiss::Index> create_native_index(IndexType type, int
dimension, int m) {
+ std::unique_ptr<faiss::Index> index;
+
+ switch (type) {
+ case IndexType::FLAT_L2:
+ index = std::make_unique<faiss::IndexFlatL2>(dimension);
+ break;
+ case IndexType::HNSW:
+ index = std::make_unique<faiss::IndexHNSWFlat>(dimension, m,
faiss::METRIC_L2);
+ break;
+ default:
+ throw std::invalid_argument("Unsupported index type");
+ }
+
+ return index;
+}
+
+// Helper function to generate a batch of random vectors
+std::vector<std::vector<float>> generate_test_vectors_matrix(int num_vectors,
int dimension) {
+ std::vector<std::vector<float>> vectors;
+ vectors.reserve(num_vectors);
+
+ for (int i = 0; i < num_vectors; i++) {
+ vectors.push_back(generate_random_vector(dimension));
+ }
+
+ return vectors;
+}
+
+std::vector<float> generate_test_vectors_flatten(int num_vectors, int
dimension) {
+ std::vector<float> vectors;
+ vectors.reserve(num_vectors * dimension);
+
+ for (int i = 0; i < num_vectors; i++) {
+ auto tmp = generate_random_vector(dimension);
+ vectors.insert(vectors.end(), tmp.begin(), tmp.end());
+ }
+
+ return vectors;
+}
+
+// Helper function to add vectors to both Doris and native indexes
+void add_vectors_to_indexes_serial_mode(segment_v2::VectorIndex* doris_index,
+ faiss::Index* native_index,
+ const std::vector<std::vector<float>>&
vectors) {
+ for (size_t i = 0; i < vectors.size(); i++) {
+ if (doris_index) {
+ auto status = doris_index->add(1, vectors[i].data());
+ ASSERT_TRUE(status.ok())
+ << "Failed to add vector to Doris index: " <<
status.to_string();
+ }
+ if (native_index) {
+ // Add vector to native Faiss index
+ native_index->add(1, vectors[i].data());
+ }
+ }
+}
+
+void add_vectors_to_indexes_batch_mode(segment_v2::VectorIndex* doris_index,
+ faiss::Index* native_index, size_t
num_vectors,
+ const std::vector<float>&
flatten_vectors) {
+ if (doris_index) {
+ auto status = doris_index->add(num_vectors, flatten_vectors.data());
+ ASSERT_TRUE(status.ok()) << "Failed to add vectors to Doris index: "
<< status.to_string();
+ }
+
+ if (native_index) {
+ // Add vectors to native Faiss index
+ native_index->add(num_vectors, flatten_vectors.data());
+ }
+}
+
+// Helper function to print search results for comparison
+void print_search_results(const segment_v2::IndexSearchResult& doris_results,
+ const std::vector<float>& native_distances,
+ const std::vector<faiss::idx_t>& native_indices, int
query_idx) {
+ std::cout << "Query vector index: " << query_idx << std::endl;
+
+ std::cout << "Doris Index Results:" << std::endl;
+ for (int i = 0; i < doris_results.roaring->cardinality(); i++) {
+ std::cout << "ID: " << doris_results.roaring->getIndex(i)
+ << ", Distance: " << doris_results.distances[i] << std::endl;
+ }
+
+ std::cout << "Native Faiss Results:" << std::endl;
+ for (size_t i = 0; i < native_indices.size(); i++) {
+ if (native_indices[i] == -1) continue;
+ std::cout << "ID: " << native_indices[i] << ", Distance: " <<
native_distances[i]
+ << std::endl;
+ }
+}
+
+// Helper function to compare search results between Doris and native Faiss
+void compare_search_results(const segment_v2::IndexSearchResult& doris_results,
+ const std::vector<float>& native_distances,
+ const std::vector<faiss::idx_t>& native_indices,
float abs_error) {
+ EXPECT_EQ(doris_results.roaring->cardinality(),
+ std::count_if(native_indices.begin(), native_indices.end(),
+ [](faiss::idx_t id) { return id != -1; }));
+
+ for (size_t i = 0; i < native_indices.size(); i++) {
+ if (native_indices[i] == -1) continue;
+
+ EXPECT_TRUE(doris_results.roaring->contains(native_indices[i]))
+ << "ID mismatch at rank " << i;
+ EXPECT_NEAR(doris_results.distances[i], native_distances[i], abs_error)
+ << "Distance mismatch at rank " << i;
+ }
+}
+
+// result is a vector of pairs, where each pair contains the labels and
distance
+// result is sorted by labels
+std::vector<std::pair<int, float>>
perform_native_index_range_search(faiss::Index* index,
+ const
float* query_vector,
+ float
radius) {
+ std::vector<std::pair<int, float>> results;
+ faiss::RangeSearchResult result(1);
+ index->range_search(1, query_vector, radius * radius, &result);
+ size_t begin = result.lims[0];
+ size_t end = result.lims[1];
+ results.reserve(end - begin);
+ for (size_t j = begin; j < end; ++j) {
+ results.push_back({result.labels[j], sqrt(result.distances[j])});
+ }
+ std::sort(results.begin(), results.end(),
+ [](const auto& a, const auto& b) { return a.first < b.first; });
+ return results;
+}
+
+std::unique_ptr<doris::segment_v2::IndexSearchResult>
perform_doris_index_range_search(
+ segment_v2::VectorIndex* index, const float* query_vector, float
radius,
+ const segment_v2::IndexSearchParameters& params) {
+ auto result = std::make_unique<doris::segment_v2::IndexSearchResult>();
+ std::ignore = index->range_search(query_vector, radius, params, *result);
+ return result;
+}
+
+float get_radius_from_flatten(const float* vector, int dim,
+ const std::vector<float>& flatten_vectors, float
percentile) {
+ size_t n = flatten_vectors.size() / dim;
+ std::vector<std::pair<size_t, float>> distances(n);
+ for (int i = 0; i < n; i++) {
+ double sum = 0;
+ for (int j = 0; j < dim; j++) {
+ accumulate(flatten_vectors[i * dim + j], flatten_vectors[j], sum);
+ }
+ distances[i] = std::make_pair(i, finalize(sum));
+ }
+ std::sort(distances.begin(), distances.end(),
+ [](const auto& a, const auto& b) { return a.second < b.second;
});
+ // Use the median distance as the radius
+ size_t percentile_index = static_cast<size_t>(n * percentile);
+ float radius = distances[percentile_index].second;
+
+ return radius;
+}
+
+float get_radius_from_matrix(const float* vector, int dim,
+ const std::vector<std::vector<float>>&
matrix_vectors,
+ float percentile) {
+ size_t n = matrix_vectors.size();
+ std::vector<std::pair<size_t, float>> distances(n);
+ for (size_t i = 0; i < n; i++) {
+ double sum = 0;
+ for (int j = 0; j < dim; j++) {
+ accumulate(matrix_vectors[i][j], vector[j], sum);
+ }
+ distances[i] = std::make_pair(i, finalize(sum));
+ }
+ std::sort(distances.begin(), distances.end(),
+ [](const auto& a, const auto& b) { return a.second < b.second;
});
+ // Use the median distance as the radius
+ size_t percentile_index = static_cast<size_t>(n * percentile);
+ float radius = distances[percentile_index].second;
+
+ return radius;
+}
+} // namespace doris::vector_search_utils
\ No newline at end of file
diff --git a/be/test/olap/vector_search/vector_search_utils.h
b/be/test/olap/vector_search/vector_search_utils.h
index b22e01b63b5..8fd79997819 100644
--- a/be/test/olap/vector_search/vector_search_utils.h
+++ b/be/test/olap/vector_search/vector_search_utils.h
@@ -20,6 +20,7 @@
#include <gen_cpp/Descriptors_types.h>
#include <gen_cpp/Exprs_types.h>
#include <gen_cpp/Types_types.h>
+#include <gen_cpp/olap_file.pb.h>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include <stdint.h>
@@ -36,12 +37,75 @@
#include "olap/tablet_schema.h"
#include "runtime/descriptors.h"
#include "vec/exprs/vexpr_context.h"
+#include "vector_index.h"
// Add CLucene RAM Directory header
#include <CLucene/store/RAMDirectory.h>
+#include <faiss/MetricType.h>
using doris::segment_v2::DorisCompoundReader;
-namespace doris::vec_search_mock {
+namespace faiss {
+class Index;
+class IndexHNSWFlat;
+} // namespace faiss
+
+namespace doris::segment_v2 {
+class FaissVectorIndex;
+}
+
+namespace doris::vector_search_utils {
+
+// Generate random vectors for testing
+std::vector<float> generate_random_vector(int dim);
+// Generate random vectors in matrix form
+std::vector<std::vector<float>> generate_test_vectors_matrix(int num_vectors,
int dimension);
+// Generate random vectors as a flatten vector
+std::vector<float> generate_test_vectors_flatten(int num_vectors, int
dimension);
+
+// Enum for different index types
+enum class IndexType {
+ FLAT_L2,
+ HNSW,
+ // Add more index types as needed
+};
+std::unique_ptr<faiss::Index> create_native_index(IndexType type, int
dimension, int m);
+std::unique_ptr<doris::segment_v2::VectorIndex> create_doris_index(IndexType
index_type,
+ int
dimension, int m);
+
+// Helper function to add vectors to both Doris and native indexes
+void add_vectors_to_indexes_serial_mode(segment_v2::VectorIndex* doris_index,
+ faiss::Index* native_index,
+ const std::vector<std::vector<float>>&
vectors);
+
+void add_vectors_to_indexes_batch_mode(segment_v2::VectorIndex* doris_index,
+ faiss::Index* native_index, size_t
num_vectors,
+ const std::vector<float>&
flatten_vectors);
+
+void print_search_results(const segment_v2::IndexSearchResult& doris_results,
+ const std::vector<float>& native_distances,
+ const std::vector<faiss::idx_t>& native_indices, int
query_idx);
+
+float get_radius_from_flatten(const float* vector, int dim,
+ const std::vector<float>& flatten_vectors, float
percentile);
+float get_radius_from_matrix(const float* vector, int dim,
+ const std::vector<std::vector<float>>&
matrix_vectors,
+ float percentile);
+// Helper function to compare search results between Doris and native Faiss
+void compare_search_results(const segment_v2::IndexSearchResult& doris_results,
+ const std::vector<float>& native_distances,
+ const std::vector<faiss::idx_t>& native_indices,
+ float abs_error = 1e-5);
+
+// result is a vector of pairs, where each pair contains the labels and
distance
+// result is sorted by labels
+std::vector<std::pair<int, float>>
perform_native_index_range_search(faiss::Index* index,
+ const
float* query_vector,
+ float
radius);
+
+std::unique_ptr<doris::segment_v2::IndexSearchResult>
perform_doris_index_range_search(
+ segment_v2::VectorIndex* index, const float* query_vector, float
radius,
+ const segment_v2::IndexSearchParameters& params);
+
class MockIndexFileReader : public ::doris::segment_v2::IndexFileReader {
public:
MockIndexFileReader()
@@ -83,8 +147,6 @@ public:
~MockAnnIndexIterator() override = default;
- IndexType type() override { return IndexType::ANN; }
-
MOCK_METHOD(Status, read_from_index, (const doris::segment_v2::IndexParam&
param), (override));
MOCK_METHOD(Status, range_search,
(const segment_v2::RangeSearchParams& params,
@@ -95,7 +157,7 @@ public:
private:
io::IOContext _io_ctx_mock;
};
-} // namespace doris::vec_search_mock
+} // namespace doris::vector_search_utils
namespace doris::vectorized {
template <typename T>
@@ -189,7 +251,7 @@ protected:
virtual_slot_ref_node.__isset.opcode = false;
_virtual_slot_ref_expr.nodes.push_back(virtual_slot_ref_node);
- _ann_index_iterator =
std::make_unique<vec_search_mock::MockAnnIndexIterator>();
+ _ann_index_iterator =
std::make_unique<vector_search_utils::MockAnnIndexIterator>();
_row_desc = RowDescriptor(*_desc_tbl, {0}, {false});
@@ -210,7 +272,7 @@ protected:
private:
doris::ObjectPool obj_pool;
RowDescriptor _row_desc;
- std::unique_ptr<vec_search_mock::MockAnnIndexIterator> _ann_index_iterator;
+ std::unique_ptr<vector_search_utils::MockAnnIndexIterator>
_ann_index_iterator;
vectorized::IColumn::MutablePtr _result_column;
doris::TExpr _virtual_slot_ref_expr;
DescriptorTbl* _desc_tbl;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]