This is an automated email from the ASF dual-hosted git repository. spmallette pushed a commit to branch TINKERPOP-3158 in repository https://gitbox.apache.org/repos/asf/tinkerpop.git
commit 8bd6f8a3cf88137f7f838e7ba65cb1fc17a2427f Author: Stephen Mallette <[email protected]> AuthorDate: Tue May 6 12:56:08 2025 -0400 docs and imports --- CHANGELOG.asciidoc | 4 +- .../reference/implementations-tinkergraph.asciidoc | 162 +++++++++++++++++++++ docs/src/upgrade/release-3.8.x.asciidoc | 0 .../jsr223/TinkerGraphGremlinPlugin.java | 8 + 4 files changed, 173 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.asciidoc b/CHANGELOG.asciidoc index e312807286..fd480a703e 100644 --- a/CHANGELOG.asciidoc +++ b/CHANGELOG.asciidoc @@ -25,6 +25,8 @@ image::https://raw.githubusercontent.com/apache/tinkerpop/master/docs/static/ima [[release-4-0-0]] === TinkerPop 4.0.0 (Release Date: NOT OFFICIALLY RELEASED YET) +* Added vector search to TinkerGraph. +* Renamed the regex based seach service in TinkerGraph to `tinker.search.text`. * Added typed numeric wrappers and `preciseNumbers` connection option to `gremlin-javascript` for explicit control over numeric type serialization and deserialization. * Added `NextN(n)` to `Traversal` in `gremlin-go` for batched result iteration, providing API parity with `next(n)` in the Java, Python, and .NET GLVs, and updated the Go translators in `gremlin-core` and `gremlin-javascript` to emit `NextN(n)` for the batched form. * Added Gremlator, a single page web application, that translates Gremlin into various programming languages like Javascript and Python. @@ -83,7 +85,7 @@ image::https://raw.githubusercontent.com/apache/tinkerpop/master/docs/static/ima * Replace `WebSocket` with `HTTP` (non-streaming) in `gremlin-dotnet`. * Added `MimeType` to `IMessageSerializer` and split client option to allow separate request and response serialization in `gremlin-dotnet`. * Added `RequestInterceptor` to `gremlin-dotnet` with `auth` reference implementations. -* Added streaming deserialization to `gremlin-dotnet`, results are now deserialized incrementally from the HTTP response stream via `IAsyncEnumerable<T>`. +* Added streaming deserialization to `gremlin-dotnet`, results are now deserialized incrementally from the HTTP response stream via `IAsyncEnumerable<T>`. * Target framework for `gremlin-dotnet` updated to `net8.0`. * Bumped minimum Node.js version for `gremlin-javascript` from 20 to 22. * Removed `mimeType` connection option in `gremlin-javascript` in favor of direct use of `reader` and `writer` options. diff --git a/docs/src/reference/implementations-tinkergraph.asciidoc b/docs/src/reference/implementations-tinkergraph.asciidoc index 41e3e9b451..c5aeeafa49 100644 --- a/docs/src/reference/implementations-tinkergraph.asciidoc +++ b/docs/src/reference/implementations-tinkergraph.asciidoc @@ -219,6 +219,168 @@ g.io("data/tinkerpop-crew.kryo").read().iterate() g.V().properties() ---- +=== Indices + +Indices are a critical component of graph databases that significantly improve query performance by allowing for quick +lookups of elements based on their properties. Without indices, graph traversals would need to scan through all +elements to find those matching specific criteria, which becomes increasingly inefficient as the graph grows in size. + +TinkerGraph supports two types of indices: + +1. *Key-based Indices*: Traditional indices that enable fast lookups of vertices or edges based on property values. +2. *Vector Indices*: Specialized indices for similarity search that find elements with vector embeddings similar to a +query vector. + +==== Key-based Indices + +Key-based indices are the standard way to improve lookup performance in TinkerGraph. They're simple to create and use: + +[gremlin-groovy] +---- +graph.createIndex("name", Vertex.class) <1> +g.addV("person").property("name", "marko").property("age", 29).iterate() +g.addV("person").property("name", "vadas").property("age", 27).iterate() +g.addV("person").property("name", "josh").property("age", 32).iterate() +g.V().has("name", "marko").next() <2> +---- + +<1> Create an index on the "name" property for vertices. +<2> Query using the index - this will be much faster than a full scan. + +You can also create indices for edge properties: + +[gremlin-groovy] +---- +graph.createIndex("weight", Edge.class) <1> +marko = g.V().has("name", "marko").next() +vadas = g.V().has("name", "vadas").next() +g.addE("knows").from(marko).to(vadas).property("weight", 0.5).iterate() +g.E().has("weight", 0.5).next() <2> +---- + +<1> Create an index on the "weight" property for edges. +<2> Query using the index - this will be much faster than a full scan. + +To check which properties are indexed for a particular element class: + +[gremlin-groovy] +---- +graph.getIndexedKeys(Vertex.class) <1> +graph.getIndexedKeys(Edge.class) <2> +---- + +<1> Get all indexed keys for vertices. +<2> Get all indexed keys for edges. + +To drop an index when it's no longer needed: + +[gremlin-groovy] +---- +graph.dropIndex("name", Vertex.class) +---- + +==== Vector Indices + +Vector indices enable similarity search based on vector embeddings. Vector search allows you to find elements that are +conceptually similar rather than just matching on exact property values. Vector search works by representing elements as +points in a high-dimensional space, where proximity indicates similarity. TinkerGraph implements vector search using +HNSW (Hierarchical Navigable Small World), an algorithm for approximate nearest neighbor search that provides excellent +performance for high-dimensional vector data. + +To use vector indices in TinkerGraph, you need to: + +1. Create a vector index with a specified dimension +2. Add elements with vector embeddings +3. Perform searches using either direct graph methods or the `call()` step + +Here's a complete example: + +[gremlin-groovy] +---- +graph.getServiceRegistry().registerService(new TinkerVectorSearchFactory(graph)) <1> +indexConfig = [:] +indexConfig.put(TinkerVectorIndex.CONFIG_DIMENSION, 3) <2> +graph.createIndex(TinkerIndexType.VECTOR, "embedding", Vertex.class, indexConfig) +g.addV("person").property("name", "Alice").property("embedding", new float[]{1.0f, 0.0f, 0.0f}).iterate() +g.addV("person").property("name", "Bob").property("embedding", new float[]{0.0f, 1.0f, 0.0f}).iterate() +g.addV("person").property("name", "Charlie").property("embedding", new float[]{0.0f, 0.0f, 1.0f}).iterate() +g.addV("person").property("name", "Dave").property("embedding", new float[]{0.9f, 0.1f, 0.0f}).iterate() +params = [key: "embedding", topK: 2] <3> +g.V().has("name", "Alice").call("tinker.search.vector.topKByElement", params).toList() +---- + +<1> Register the vector search service. +<2> Configuration for the vector index that defines the embedding dimension of size 3. +<3> Specify the property key containing the embedding and number of results to return. + +The `call()` step returns a list of maps, each containing: + +* `distance`: The similarity score between the query vector and the result +* `element`: The vertex or edge that matches the query + +Vector indices can also be created for edges: + +[gremlin-groovy] +---- +graph.createIndex(TinkerIndexType.VECTOR, "embedding", Edge.class, indexConfig) +alice = g.V().has("name", "Alice").next() +bob = g.V().has("name", "Bob").next() +charlie = g.V().has("name", "Charlie").next() +g.addE("knows").from(alice).to(bob).property("embedding", new float[]{1.0f, 0.0f, 0.0f}).iterate() +g.addE("knows").from(bob).to(charlie).property("embedding", new float[]{0.0f, 1.0f, 0.0f}).iterate() +params = [key: "embedding"] +g.E().has("embedding", new float[]{1.0f, 0.0f, 0.0f}).call("tinker.search.vector.topKByElement", params).toList() +---- + +TinkerGraph supports various distance functions for vector similarity search: + +* `COSINE`: Measures the cosine of the angle between two vectors (default) +* `EUCLIDEAN`: Measures the straight-line distance between two points +* `MANHATTAN`: Measures the sum of absolute differences between coordinates +* `INNER_PRODUCT`: Measures the dot product of two vectors +* `BRAY_CURTIS`: Measures the Bray-Curtis dissimilarity +* `CANBERRA`: Measures the weighted Manhattan distance +* `CORRELATION`: Measures the correlation distance + +You can specify the distance function when creating the vector index: + +[gremlin-groovy] +---- +indexConfig = [:] +indexConfig.put(TinkerVectorIndex.CONFIG_DIMENSION, 3) +indexConfig.put(TinkerVectorIndex.CONFIG_DISTANCE_TYPE, TinkerIndexType.Vector.EUCLIDEAN) +graph.createIndex(TinkerIndexType.VECTOR, "embedding", Vertex.class, indexConfig) +---- + +There are several configuration options that can be used to fine-tune the performance and behavior of vector indices. +These options are specified when creating a vector index: + +[width="100%",cols="2,5,2",options="header"] +|========================================================= +|Configuration Option |Description |Default Value +|`CONFIG_DIMENSION` |The dimension of the vector embeddings. This is a required parameter and must match the length of the vector embeddings stored in the graph. |N/A (Required) +|`CONFIG_DISTANCE_TYPE` |The distance function to use for similarity calculations. Must be one of the `TinkerIndexType.Vector` enum values (COSINE, EUCLIDEAN, MANHATTAN, INNER_PRODUCT, BRAY_CURTIS, CANBERRA, CORRELATION). |COSINE +|`CONFIG_M` |The maximum number of connections per node in the HNSW graph. Higher values provide better search quality at the cost of increased memory usage and index build time. |16 +|`CONFIG_EF_CONSTRUCTION` |The size of the dynamic candidate list during index construction. Higher values improve index quality at the cost of longer build times. |200 +|`CONFIG_EF` |The size of the dynamic candidate list during search. Higher values improve search accuracy at the cost of slower search times. |10 +|`CONFIG_MAX_ITEMS` |The maximum number of items that can be stored in the index. |100 +|========================================================= + +Here's an example of creating a vector index with custom configuration options: + +[gremlin-groovy] +---- +// create a vector index with custom configuration +indexConfig = [:] +indexConfig.put(TinkerVectorIndex.CONFIG_DIMENSION, 128) +indexConfig.put(TinkerVectorIndex.CONFIG_DISTANCE_TYPE, TinkerIndexType.Vector.COSINE) +indexConfig.put(TinkerVectorIndex.CONFIG_M, 32) +indexConfig.put(TinkerVectorIndex.CONFIG_EF_CONSTRUCTION, 300) +indexConfig.put(TinkerVectorIndex.CONFIG_EF, 20) +indexConfig.put(TinkerVectorIndex.CONFIG_MAX_ITEMS, 1000) +graph.createIndex(TinkerIndexType.VECTOR, "embedding", Vertex.class, indexConfig) +---- + [[tinkergraph-gremlin-tx]] === Transactions diff --git a/docs/src/upgrade/release-3.8.x.asciidoc b/docs/src/upgrade/release-3.8.x.asciidoc new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tinkergraph-gremlin/src/main/java/org/apache/tinkerpop/gremlin/tinkergraph/jsr223/TinkerGraphGremlinPlugin.java b/tinkergraph-gremlin/src/main/java/org/apache/tinkerpop/gremlin/tinkergraph/jsr223/TinkerGraphGremlinPlugin.java index 007db18352..b346cc570f 100644 --- a/tinkergraph-gremlin/src/main/java/org/apache/tinkerpop/gremlin/tinkergraph/jsr223/TinkerGraphGremlinPlugin.java +++ b/tinkergraph-gremlin/src/main/java/org/apache/tinkerpop/gremlin/tinkergraph/jsr223/TinkerGraphGremlinPlugin.java @@ -28,6 +28,10 @@ import org.apache.tinkerpop.gremlin.tinkergraph.process.computer.TinkerMemory; import org.apache.tinkerpop.gremlin.tinkergraph.process.computer.TinkerMessenger; import org.apache.tinkerpop.gremlin.tinkergraph.process.computer.TinkerReduceEmitter; import org.apache.tinkerpop.gremlin.tinkergraph.process.computer.TinkerWorkerPool; +import org.apache.tinkerpop.gremlin.tinkergraph.services.TinkerDegreeCentralityFactory; +import org.apache.tinkerpop.gremlin.tinkergraph.services.TinkerServiceRegistry; +import org.apache.tinkerpop.gremlin.tinkergraph.services.TinkerTextSearchFactory; +import org.apache.tinkerpop.gremlin.tinkergraph.services.TinkerVectorSearchFactory; import org.apache.tinkerpop.gremlin.tinkergraph.structure.TinkerEdge; import org.apache.tinkerpop.gremlin.tinkergraph.structure.TinkerElement; import org.apache.tinkerpop.gremlin.tinkergraph.structure.TinkerFactory; @@ -59,6 +63,10 @@ public final class TinkerGraphGremlinPlugin extends AbstractGremlinPlugin { TinkerIoRegistryV2.class, TinkerIoRegistryV3.class, TinkerIoRegistryV4.class, + TinkerVectorSearchFactory.class, + TinkerTextSearchFactory.class, + TinkerDegreeCentralityFactory.class, + TinkerServiceRegistry.TinkerServiceFactory.class, TinkerProperty.class, TinkerVertex.class, TinkerVertexProperty.class,
