mikemccand commented on PR #14178:
URL: https://github.com/apache/lucene/pull/14178#issuecomment-2844751984
I was able to test this PR, yay!
I first installed `faiss-cpu` via the Anaconda `pytorch` channel into my dev
box, then applied the patch diff from this PR, then tweaked luceneutil with
this hackity patch:
```
diff --git a/src/main/knn/KnnGraphTester.java
b/src/main/knn/KnnGraphTester.java
index 327b826c..441e80bb 100644
--- a/src/main/knn/KnnGraphTester.java
+++ b/src/main/knn/KnnGraphTester.java
@@ -46,6 +46,9 @@ import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
import java.util.function.BinaryOperator;
+// nocommit
+import org.apache.lucene.sandbox.codecs.faiss.FaissKnnVectorsFormat;
+
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.KnnVectorsReader;
@@ -531,6 +534,9 @@ public class KnnGraphTester {
}
}
+ private void computeIndexStatistics(Path indexPath) throws IOException {
+ }
+
private void printIndexStatistics(Path indexPath) throws IOException {
try (Directory dir = FSDirectory.open(indexPath);
IndexReader reader = DirectoryReader.open(dir)) {
@@ -711,14 +717,12 @@ public class KnnGraphTester {
HnswGraph knnValues;
if (vectorsReader instanceof Lucene99HnswVectorsReader
hnswVectorsReader) {
knnValues = hnswVectorsReader.getGraph(KNN_FIELD);
- } else {
- throw new IllegalStateException("unsupported vectors reader: " +
vectorsReader);
- }
- log("Leaf %d has %d layers\n", context.ord, knnValues.numLevels());
- log("Leaf %d has %d documents\n", context.ord, leafReader.maxDoc());
- printGraphFanout(knnValues, leafReader.maxDoc());
- printGraphConnectedNess(knnValues);
+ log("Leaf %d has %d layers\n", context.ord,
knnValues.numLevels());
+ log("Leaf %d has %d documents\n", context.ord,
leafReader.maxDoc());
+ printGraphFanout(knnValues, leafReader.maxDoc());
+ printGraphConnectedNess(knnValues);
+ }
}
}
}
@@ -1260,6 +1264,8 @@ public class KnnGraphTester {
return new Lucene103Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
+ // nocommit
+ /*
if (quantize) {
if (quantizeBits == 1) {
return switch (indexType) {
@@ -1272,12 +1278,16 @@ public class KnnGraphTester {
} else {
return new Lucene99HnswVectorsFormat(maxConn, beamWidth,
numMergeWorker, null);
}
+ */
+ return new FaissKnnVectorsFormat("IDMap,HNSW32",
"efConstruction=200,efSearch=150");
}
};
} else {
return new Lucene103Codec() {
@Override
public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
+ // nocommit
+ /*
if (quantize) {
if (quantizeBits == 1) {
return switch (indexType) {
@@ -1290,6 +1300,8 @@ public class KnnGraphTester {
} else {
return new Lucene99HnswVectorsFormat(maxConn, beamWidth,
numMergeWorker, exec);
}
+ */
+ return new FaissKnnVectorsFormat("IDMap,HNSW32",
"efConstruction=200,efSearch=150");
}
};
}
diff --git a/src/python/benchUtil.py b/src/python/benchUtil.py
index 45949cde..e105cbf7 100644
--- a/src/python/benchUtil.py
+++ b/src/python/benchUtil.py
@@ -1699,6 +1699,10 @@ def getAntClassPath(checkout):
raise RuntimeError("can't find core JAR file in %s" %
("%s/lucene/build/core" % path))
cp.append(core_jar_file)
+
+ # nocommit -- for Faiss Codec wrapper
+ cp.append('/home/mike/miniforge3/lib'
+ )
cp.append("%s/lucene/build/sandbox/classes/java" % path)
cp.append("%s/lucene/build/misc/classes/java" % path)
cp.append("%s/lucene/build/facet/classes/java" % path)
diff --git a/src/python/knnPerfTest.py b/src/python/knnPerfTest.py
index 1e971aee..03d17125 100644
--- a/src/python/knnPerfTest.py
+++ b/src/python/knnPerfTest.py
@@ -82,7 +82,7 @@ PARAMS = {
"quantizeCompress": (True,),
# "indexType": ("flat", "hnsw"), # index type, only works with singlt bit
"queryStartIndex": (0,), # seek to this start vector before searching,
to sample different vectors
- # "forceMerge": (True, False),
+ #"forceMerge": (True,),
#'niter': (10,),
}
@@ -132,6 +132,10 @@ def run_knn_benchmark(checkout, values):
jfr_output = f"{constants.LOGS_DIR}/knn-perf-test.jfr"
cp = benchUtil.classPathToString(benchUtil.getClassPath(checkout) +
(f"{constants.BENCH_BASE_DIR}/build",))
+
+ # nocommit -- must use JAR so SPI can find the codec:
+ cp =
'/l/faiss/lucene/sandbox/build/libs/lucene-sandbox-11.0.0-SNAPSHOT.jar:' + cp
+
cmd = constants.JAVA_EXE.split(" ") + [
"-cp",
cp,
@@ -143,6 +147,9 @@ def run_knn_benchmark(checkout, values):
"-XX:+DebugNonSafepoints",
]
+ # nocommit
+
cmd.append('-Djava.library.path=/home/mike/miniforge3/envs/faiss_lucene_codec/lib:/home/mike/miniforge3/lib/:/usr/java/packages/lib:/usr/lib64:/lib64:/lib:/usr/lib')
+
if DO_PROFILING:
cmd +=
[f"-XX:StartFlightRecording=dumponexit=true,maxsize=250M,settings={constants.BENCH_BASE_DIR}/src/python/profiling.jfc"
+ f",filename={jfr_output}"]
@@ -194,7 +201,7 @@ def run_knn_benchmark(checkout, values):
str(dim),
"-docs",
doc_vectors,
- "-reindex",
+ #"-reindex",
"-search-and-stats",
query_vectors,
"-numIndexThreads",
```
And then I was able to do a quick smoke test with Faiss, yay!
With a single-segment index (this is `Cohere/wikipedia-22-12-en-embeddings`,
768-dim vectors):
```
Results:
recall latency(ms) nDoc topK fanout maxConn beamWidth quantized
index(s) index_docs/s force_merge(s) num_segments index_size(MB)
vec_disk(MB) vec_RAM(MB) indexType
0.887 0.849 500000 100 50 64 250 no
265.79 1881.21 150.02 1 3065.21
1464.844 1464.844 HNSW
```
and with a multi-segment (11 segments) index:
```
Results:
recall latency(ms) nDoc topK fanout maxConn beamWidth quantized
index(s) index_docs/s num_segments index_size(MB) vec_disk(MB) vec_RAM(MB)
indexType
0.970 6.506 500000 100 50 64 250 no
231.42 2160.54 11 3065.23 1464.844 1464.844
HNSW
```
This is with `new FaissKnnVectorsFormat("IDMap,HNSW32",
"efConstruction=200,efSearch=150");` -- which, translated to the Lucene HNSW
world, I think means: `maxConn=32`, `beamWidth=200`, `topK+fanout=150`, so
`topK=100` and `fanout=50`.
It's also curious how much slower the 11-segment case is (6.5 vs 0.9 msec) --
maybe the search is not concurrent across segments?
Also, it's annoying that recall always gets better with multiple segments
(same is true of Lucene HNSW I think?) -- this is a leaky abstraction. Ideally,
a change to the underlying segment geometry should not have such a big impact
on the returned hits.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]