branch: externals/vecdb commit 1d614ad36db09dfc035e912cac345164e0eff76a Merge: 7362e46ae3 e28ccf4be4 Author: Andrew Hyatt <ahy...@gmail.com> Commit: GitHub <nore...@github.com>
Merge pull request #1 from ahyatt/chroma-integration-tests Chroma integration tests and various fixes --- README.org | 11 +-- vecdb-chroma.el | 60 ++++++++----- vecdb-integration-test.el | 217 ++++++++++++++++++++++++++++++++++++++++++++++ vecdb-qdrant.el | 21 +++-- vecdb.el | 3 +- 5 files changed, 277 insertions(+), 35 deletions(-) diff --git a/README.org b/README.org index 7c010e6410..2c9fbb2a25 100644 --- a/README.org +++ b/README.org @@ -30,15 +30,16 @@ Collections must be created before they can be used with ~vecdb-create~, and ~ve They can also be deleted with ~vecdb-delete~. * Adding and replacing data -Before data is queried, it must be added. This is done via a batch operation on -a group of data, ~vecdb-upsert-items~. This either creates an item in the collection -or replaces it, based on the =id= of the item. Here's an example of adding or -replacing one item: +Before data is queried, it must be added. This is done via a batch operation on +a group of data, ~vecdb-upsert-items~. This either creates an item in the +collection or replaces it, based on the =id= of the item. =id= should be an integer +or a string UUID. Seeing as how emacs does not provide a UUID library, probably +an integer is the best choice. #+begin_src emacs-lisp (vecdb-upsert-items my-vecdb-provider my-vecdb-collection (list (make-vecdb-item - :id "example-id" + :id 91 :vector [0.1 0.2 0.3 0.4] :payload '(:my-id "235913926")))) #+end_src diff --git a/vecdb-chroma.el b/vecdb-chroma.el index 3c9807bc3c..3343089214 100644 --- a/vecdb-chroma.el +++ b/vecdb-chroma.el @@ -103,14 +103,16 @@ SYNC indicates whether the request should be synchronous." (format "/api/v2/tenants/%s/databases" (vecdb-chroma-provider-tenant provider)) `(:name ,(vecdb-chroma-provider-database provider)) t)) - (vecdb-chroma-call - provider - 'post - (format "/api/v2/tenants/%s/databases/%s/collections" - (vecdb-chroma-provider-tenant provider) - (vecdb-chroma-provider-database provider)) - `(:name ,(vecdb-collection-name collection)) - t)) + (let ((result (vecdb-chroma-call + provider + 'post + (format "/api/v2/tenants/%s/databases/%s/collections" + (vecdb-chroma-provider-tenant provider) + (vecdb-chroma-provider-database provider)) + `(:name ,(vecdb-collection-name collection)) + t))) + (puthash (vecdb-collection-name collection) + (plist-get result :id) vecdb-chroma-collection-id-cache))) (cl-defmethod vecdb-delete ((provider vecdb-chroma-provider) (collection vecdb-collection)) @@ -145,17 +147,21 @@ SYNC indicates whether the request should be synchronous." (and (vecdb-chroma-has-tenant-p provider) (vecdb-chroma-has-database-p provider (vecdb-chroma-provider-database provider)) - (condition-case nil + (condition-case err (vecdb-chroma-call provider 'get (format "/api/v2/tenants/%s/databases/%s/collections/%s" (vecdb-chroma-provider-tenant provider) (vecdb-chroma-provider-database provider) - (vecdb-chroma-collection-id provider collection)) + ;; Although the docs say this could be the ID, what + ;; actually works in practice is only the name. + (vecdb-collection-name collection)) nil t) - (plz-error nil)))) + (plz-error (if (eq 404 (plz-response-status (plz-error-response (nth 2 err)))) + nil + (error "Error checking tenant: %s" (plz-error-message err))))))) (cl-defmethod vecdb-upsert-items ((provider vecdb-chroma-provider) (collection vecdb-collection) @@ -170,7 +176,7 @@ SYNC indicates whether the request should be synchronous." 'post url `(:embeddings ,(apply #'vector (mapcar #'vecdb-item-vector items)) - :ids ,(apply #'vector (mapcar #'vecdb-item-id items)) + :ids ,(apply #'vector (mapcar (lambda (i) (format "%s" (vecdb-item-id i))) items)) :metadatas ,(apply #'vector (mapcar #'vecdb-item-payload items))) sync))) @@ -182,15 +188,18 @@ SYNC indicates whether the request should be synchronous." (vecdb-chroma-provider-tenant provider) (vecdb-chroma-provider-database provider) (vecdb-chroma-collection-id provider collection))) - (result (vecdb-chroma-call provider 'get url - `(:ids ,(vector item-id) - :limit 1) + (result (vecdb-chroma-call provider 'post url + `(:ids ,(vector (format "%s" item-id)) + :limit 1 + :include ["embeddings" "metadatas"]) t))) - (unless (= (length (plist-get result :items)) 1) - (error "Expected exactly one item, got %d" - (length (plist-get result :items))) + (when (> (length (plist-get result :ids)) 1) + (error "Chroma returned multiple items for ID %s: %s" + item-id (plist-get result :ids))) + (when (= (length (plist-get result :ids)) 1) (make-vecdb-item - :id (aref (plist-get result :ids) 0) + :id (vecdb-chroma--normalize-stored-id + (aref (plist-get result :ids) 0)) :vector (aref (plist-get result :embeddings) 0) :payload (aref (plist-get result :metadatas) 0))))) @@ -206,9 +215,17 @@ SYNC indicates whether the request should be synchronous." provider 'post url - `(:ids ,(apply #'vector item-ids)) + `(:ids ,(apply #'vector (mapcar (lambda (id) (format "%s" id)) + item-ids))) sync))) +(defun vecdb-chroma--normalize-stored-id (id) + "From a string ID, return an integer ID if it is numeric. +If the the ID is a uuid, return it as a string." + (if (string-match-p "^[0-9]+$" id) + (string-to-number id) + id)) + (cl-defmethod vecdb-search-by-vector ((provider vecdb-chroma-provider) (collection vecdb-collection) vector &optional limit) @@ -224,7 +241,8 @@ SYNC indicates whether the request should be synchronous." t))) (cl-loop for i from 0 below (length (aref (plist-get result :ids) 0)) collect (make-vecdb-item ; - :id (aref (aref (plist-get result :ids) 0) i) + :id (vecdb-chroma--normalize-stored-id + (aref (aref (plist-get result :ids) 0) i)) :vector (aref (aref (plist-get result :embeddings) 0) i) :payload (aref (aref (plist-get result :metadatas) 0) i))))) diff --git a/vecdb-integration-test.el b/vecdb-integration-test.el new file mode 100644 index 0000000000..755e0702e1 --- /dev/null +++ b/vecdb-integration-test.el @@ -0,0 +1,217 @@ +;;; vecdb-integration-test.el --- Integration tests for vecdb -*- lexical-binding: t; -*- + +;; Copyright (C) 2025 Google LLC +;; +;; Author: Andrew Hyatt <ahy...@gmail.com> +;; Maintainer: Andrew Hyatt <ahy...@gmail.com> +;; +;; This file is not part of GNU Emacs. +;; +;; Licensed under the Apache License, Version 2.0 (the "License"); +;; you may not use this file except in compliance with the License. +;; You may obtain a copy of the License at +;; +;; http://www.apache.org/licenses/LICENSE-2.0 +;; +;; Unless required by applicable law or agreed to in writing, software +;; distributed under the License is distributed on an "AS IS" BASIS, +;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +;; See the License for the specific language governing permissions and +;; limitations under the License. +;; +;;; Commentary: +;; +;; This file contains integration tests for the `vecdb' package, +;; supporting multiple backends like Chroma and Qdrant. +;; Tests for specific backends run if corresponding environment variables are set. +;; +;; To run these tests manually: +;; 1. Ensure the relevant vector database (Chroma and/or Qdrant) is running. +;; 2. Set the necessary environment variables: +;; For Chroma: CHROMA_URL (e.g., "http://localhost:8000") +;; CHROMA_TENANT (optional, defaults to "default") +;; CHROMA_DATABASE (optional, defaults to "default") +;; For Qdrant: QDRANT_URL (e.g., "http://localhost:6333") +;; QDRANT_API_KEY (e.g., "your-api-key") +;; 3. Execute from the command line: +;; emacs -batch -l ert -l vecdb-integration-test.el -f ert-run-tests-batch-and-exit +;; +;; The tests for a specific provider will be skipped if its required +;; environment variables are not set. If no providers are configured, +;; all tests will be skipped. +;; +;;; Code: + +(require 'ert) +(require 'vecdb) +(require 'vecdb-chroma) +(require 'vecdb-qdrant) +(require 'cl-lib) ;; For cl-remove-if-not, cl-every + +(declare-function chroma-ext--tmp-project-dir "ext-chroma") + +;;; Helper Functions + +(defun vecdb-test--get-providers () + "Initialize and return a list of available `vecdb-provider' structs. +Reads configuration from environment variables. +Skips tests if no providers are configured." + (let ((providers (list))) + ;; ChromaDB Configuration + (let ((chroma-url (getenv "CHROMA_URL"))) + (when chroma-url + (add-to-list 'providers + (make-vecdb-chroma-provider + :url chroma-url + :tenant (or (getenv "CHROMA_TENANT") "default") + :database (or (getenv "CHROMA_DATABASE") "default")) + t))) + + ;; Qdrant Configuration + (let ((qdrant-url (getenv "QDRANT_URL"))) + (when qdrant-url + (let ((qdrant-api-key (getenv "QDRANT_API_KEY"))) + (if qdrant-api-key + (add-to-list 'providers + (make-vecdb-qdrant-provider + :url qdrant-url + :api-key qdrant-api-key) + t) + (warn "QDRANT_URL is set, but QDRANT_API_KEY is missing. Qdrant provider will not be configured."))))) + + (when (null providers) + (ert-skip "No vector database provider environment variables set. (CHROMA_URL or QDRANT_URL must be set)")) + providers)) + +(defmacro vecdb-test--deftest-for-providers (base-name body-function &optional docstring) + "Define `ert-deftest` forms for BASE-NAME against Chroma and Qdrant providers. +BODY-FUNCTION is a symbol of a function that takes one argument (the provider). +Docstring is optional, a default will be generated. + +Each generated test will individually skip if its specific provider +is not found in the list returned by `vecdb-test--get-providers` (which +itself might globally skip if no providers at all are configured)." + (declare (indent defun)) + (let ((chroma-test-name (intern (format "%s-chroma" base-name))) + (qdrant-test-name (intern (format "%s-qdrant" base-name))) + (base-doc (or docstring (format "Test %s for a vector database provider." base-name)))) + `(progn + (ert-deftest ,chroma-test-name () + ,(format "%s (Chroma)" base-doc) + (interactive) ; Keep interactive for manual runs if desired + (let ((current-provider (cl-find-if (lambda (p) (eq (type-of p) 'vecdb-chroma-provider)) + (vecdb-test--get-providers)))) + (if current-provider + (funcall ,body-function current-provider) + (ert-skip (format "Chroma provider not configured for %s" ',chroma-test-name))))) + + (ert-deftest ,qdrant-test-name () + ,(format "%s (Qdrant)" base-doc) + (interactive) + (let ((current-provider (cl-find-if (lambda (p) (eq (type-of p) 'vecdb-qdrant-provider)) + (vecdb-test--get-providers)))) + (if current-provider + (funcall ,body-function current-provider) + (ert-skip (format "Qdrant provider not configured for %s" ',qdrant-test-name)))))))) + +(defmacro with-test-collection (current-provider collection-var collection-name-base options &rest body) + "Execute BODY with COLLECTION-VAR bound to a new collection. +CURRENT-PROVIDER is the provider instance. +COLLECTION-VAR is the symbol to bind the collection struct to. +COLLECTION-NAME-BASE is the base string for the collection name. +OPTIONS is a plist, e.g., '(:vector-size 3). +The full collection name is generated by appending the provider's name. +The collection is created before BODY and deleted afterwards." + (declare (indent 1) (debug t)) + (let ((full-collection-name (gensym "full-collection-name-")) + (vector-size-val (gensym "vector-size-")) + (default-vector-size 3)) + `(let* ((,full-collection-name (format "%s-%s" ,collection-name-base (vecdb-provider-name ,current-provider))) + (,vector-size-val (or (plist-get ,options :vector-size) ,default-vector-size)) + (,collection-var (make-vecdb-collection :name ,full-collection-name :vector-size ,vector-size-val))) + (unwind-protect + (progn + (vecdb-create ,current-provider ,collection-var) + ,@body) + ;; Ensure cleanup even if vecdb-create fails or body errors + (if (vecdb-exists ,current-provider ,collection-var) + (vecdb-delete ,current-provider ,collection-var) + (message "Collection %s did not exist or was already deleted during cleanup." ,full-collection-name)))))) + +;;; Test cases Body Functions + +(defun vecdb-test-create-exists-delete-collection-body (current-provider) + "Core logic for testing create, exists, and delete collection." + (let* ((collection-name-base "test-collection-ced") + (collection-name (format "%s-%s" collection-name-base (vecdb-provider-name current-provider))) + (collection (make-vecdb-collection :name collection-name :vector-size 3))) + (unwind-protect + (progn + (vecdb-create current-provider collection) + (should (vecdb-exists current-provider collection)) + (vecdb-delete current-provider collection) + (should-not (vecdb-exists current-provider collection))) + ;; Cleanup in case of error during assertions + (when (vecdb-exists current-provider collection) + (vecdb-delete current-provider collection))))) + +(vecdb-test--deftest-for-providers vecdb-test-create-exists-delete-collection + #'vecdb-test-create-exists-delete-collection-body + "Test `vecdb-create', `vecdb-exists', and `vecdb-delete'.") + +(defun vecdb-test-upsert-get-delete-items-body (current-provider) + "Core logic for testing upsert and get items." + (let* ((collection-name "test-collection-ug") + (vector-size 3) + (items (list + (make-vecdb-item :id 1 :vector [0 1 2] :payload '(:val 1)) + (make-vecdb-item :id 2 :vector [0 1 2] :payload '(:val 2)) + (make-vecdb-item :id 3 :vector [0 1 2] :payload '(:val 3))))) + (with-test-collection current-provider current-collection collection-name `(:vector-size ,vector-size) + (vecdb-upsert-items current-provider current-collection items t) + (dolist (item items) + (let ((retrieved-item (vecdb-get-item current-provider current-collection (vecdb-item-id item)))) + (should retrieved-item) + (should (equal (vecdb-item-id item) (vecdb-item-id retrieved-item))) + ;; We don't test to see if the vector is equal, + ;; because it could be normalized. + (should (equal (vecdb-item-payload item) (vecdb-item-payload retrieved-item))))) + (vecdb-delete-items current-provider current-collection (mapcar #'vecdb-item-id items) t) + (dolist (item items) + (should-not (vecdb-get-item current-provider current-collection (vecdb-item-id item))))))) + +(vecdb-test--deftest-for-providers vecdb-test-upsert-get-delete-items + #'vecdb-test-upsert-get-delete-items-body + "Test `vecdb-upsert-items', `vecdb-get-item' and `vecdb-delete-items'.") + +(defun vecdb-test-search-by-vector-body (current-provider) + "Core logic for testing search by vector." + (let* ((collection-name "test-collection-sv") + (vector-size 3) + (item1 (make-vecdb-item :id 1 :vector [0.1 0.2 0.3] :payload '(:val 1))) + (item2 (make-vecdb-item :id 2 :vector [0.4 0.5 0.6] :payload '(:val 2))) + (item3 (make-vecdb-item :id 3 :vector [0.7 0.8 0.9] :payload '(:val 3))) + (items (list item1 item2 item3))) + (with-test-collection current-provider current-collection collection-name `(:vector-size ,vector-size) + (vecdb-upsert-items current-provider current-collection items t) + ;; Search for a vector similar to item2 + (let ((results (vecdb-search-by-vector current-provider current-collection [0.41 0.51 0.61] 3))) + (should (= (length results) 3)) + ;; Item2 should be the closest + (let ((first-result (car results))) + (should (equal (vecdb-item-id first-result) (vecdb-item-id item2))) + (should (equal (vecdb-item-payload first-result) (vecdb-item-payload item2)))) + ;; Check if all original items are present in results (order might vary beyond the first) + (should (cl-every (lambda (item) + (cl-find-if (lambda (res-item) (equal (vecdb-item-id item) (vecdb-item-id res-item))) + results)) + items)))))) + +(vecdb-test--deftest-for-providers + vecdb-test-search-by-vector + #'vecdb-test-search-by-vector-body + "Test `vecdb-search-by-vector'.") + +(provide 'vecdb-integration-test) + +;;; vecdb-integration-test.el ends here diff --git a/vecdb-qdrant.el b/vecdb-qdrant.el index 65cdf38656..d6e35b0a14 100644 --- a/vecdb-qdrant.el +++ b/vecdb-qdrant.el @@ -106,14 +106,19 @@ SYNC indicates whether the request should be synchronous." (collection vecdb-collection) id) "Get items with ID from the COLLECTION with PROVIDER." - (let ((result (vecdb-qdrant-call provider 'get - (concat "/collections/" (vecdb-collection-name collection) "/points/" id)))) - (when result - (let ((point (plist-get result :result))) - (when point - (make-vecdb-item :id (plist-get point :id) - :vector (plist-get point :vector) - :payload (plist-get point :payload))))))) + (condition-case err + (let ((result (vecdb-qdrant-call provider 'get + (format "/collections/%s/points/%s" (vecdb-collection-name collection) id) nil t))) + (when result + (let ((point (plist-get result :result))) + (when point + (make-vecdb-item :id (plist-get point :id) + :vector (plist-get point :vector) + :payload (plist-get point :payload)))))) + (plz-error + (if (eq 404 (plz-response-status (plz-error-response (nth 2 err)))) + nil + (error "Error retrieving item: %s" (plz-error-message err)))))) (cl-defmethod vecdb-delete-items ((provider vecdb-qdrant-provider) (collection vecdb-collection) diff --git a/vecdb.el b/vecdb.el index 6c94bb7d63..78179c804b 100644 --- a/vecdb.el +++ b/vecdb.el @@ -75,7 +75,8 @@ An ID may be an integer or a string, and is used to uniquely identify the item." (list "vecdb-upsert not implemented for" (vecdb-provider-name provider)))) (cl-defgeneric vecdb-get-item (provider collection id) - "Get items with ID from the COLLECTION with PROVIDER." + "Get items with ID from the COLLECTION with PROVIDER. +If the item does not exist, return `nil'." (ignore collection id) (signal 'not-implemented (list "vecdb-get not implemented for" (vecdb-provider-name provider))))