branch: externals/vecdb
commit fcf54a3a8e385be060c100dbb10995c92bb96607
Author: Andrew Hyatt <ahy...@gmail.com>
Commit: Andrew Hyatt <ahy...@gmail.com>

    Add chroma provider support to embed-db
---
 README.org      |  36 ++++++++-
 embed-chroma.el | 230 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 264 insertions(+), 2 deletions(-)

diff --git a/README.org b/README.org
index 28b3392d0c..a89fbc74cb 100644
--- a/README.org
+++ b/README.org
@@ -7,20 +7,21 @@ This package doesn't provide end-user functionality on its 
own; it is designed t
 
 The package does not provide embeddings, that can be done with the 
[[https://github.com/ahyatt/llm][llm]] package, or any source of embeddings.
 * Configuring the collection
-There are two concepts that together define a collection database of 
embeddings: the /provider/, and the /collection/.  The provider is what kind of 
backend we are using, such as =sqlite-vec=, or =qdrant=.  This is a struct 
defined by the exact provider you want to use.
+There are two concepts that together define a collection database of 
embeddings: the /provider/, and the /collection/.  The provider is what kind of 
backend we are using, right now either =chroma=, or =qdrant=.  This is a struct 
defined by the exact provider you want to use.
 
 The collection is, for that provider, what exact database is getting used, 
with each collection having its own separate data.  Collections must be created 
before being used.  The collection is defined by the struct 
~embed-db-collection~ which has a ~name~ (used to identify the collection), 
~vector-size~, and ~payload-fields~.  The ~vector-size~ will be based on the 
size of the embedding vector from your provider.  1536 is what Open AI uses.  
~payload-fields~ is an alist of fields and the [...]
 
 An example, putting it all together, is:
 
 #+begin_src emacs-lisp
-(defvar my-embed-provider (make-embed-qdrant-provider :api-key 
my-qdrant-api-key :url my-drant-url))
+(defvar my-embed-provider (make-embed-qdrant-provider :api-key 
my-qdrant-api-key :url my-qdrant-url))
 (defvar my-embed-collection (make-embed-db-collection :name "my test 
collection" :vector-size 1536 :payload-fields (('my-id . 'string))))
 #+end_src
 
 The provider will be supplied by the end-user, specifying how they want things 
stored, and any data necessary for that storage and retrieval to function.  The 
collection is typically partially supplied by the application, with the 
possible exception of embedding size, which may be dependent on the exact 
embedding provider they are using.
 
 Collections must be created before they can be used with ~embed-db-create~, 
and ~embed-db-exists~ can return whether the collection exists.
+
 #+begin_src emacs-lisp
 (unless (embed-db-exists my-embed-provider my-embed-collection)
   (embed-db-create my-embed-provider my-embed-collection))
@@ -47,9 +48,40 @@ These can be deleted with ~embed-db-delete-item~ and 
retrieved by ID with ~embed
 IDs used in =embed-db= *must* be =uint64= values.  If you have another ID you 
need to use to tie it together with other storage, that should go into the 
=payload=.
 * Querying data
 Querying the database can be done with ~embed-db-search-by-vector~, passing it 
a vector and optionally a number of results to return (10 is the default).
+
 #+begin_src emacs-lisp
 (embed-db-search-by-vector my-embed-provider my-embed-collection [0.3 0.1 0.5 
-0.9] 20)
 #+end_src
 
 This will return the specifies number of =embed-db-item= structs, with the 
payloads they were stored with.
+* Providers
+** qdrant
+[[https://qdrant.tech/][qdrant]] is an open source vector database that 
concentrates mostly on running in the cloud, but can be run locally with a 
docker container.  They provide a free tier for your database in the cloud that 
may be garbage collected after a period of inactivity.
+
+A qdrant provider is defined like:
+
+#+begin_src emacs-lisp
+(defvar my-embed-provider (make-embed-qdrant-provider :api-key 
my-qdrant-api-key :url my-qdrant-url))
+#+end_src
+
+Substitute =my-qdrant-api-key= with your key, and =my-qdrant-url= with the URL of the server that serves your data.  This will be unique to your collection in the cloud, or a local URL for docker.
+** chroma
+[[https://www.trychroma.com/][chroma]] is an open source Python-centric vector 
database.  It can run as a server locally, or offers paid services to host in 
the cloud.  Currently this library only supports local running.
+
+If running locally, before use, you must run =chroma run= to start the server.
 
+The chroma provider has two additional divisions of data above the collection, and these are specified in the provider itself: the /tenant/ and the /database/.  These will both default to ="default"=, but can be specified.  Because the chroma provider is local by default, no configuration is needed:
+
+#+begin_src emacs-lisp
+(defvar my-chroma-provider (make-embed-chroma-provider))
+#+end_src
+
+However, the full set of options, shown here with values equivalent to the defaults, is:
+
+#+begin_src emacs-lisp
+(defvar my-chroma-provider (make-embed-chroma-provider
+                            :binary "chroma"
+                            :url "http://localhost:8000"
+                            :tenant "default"
+                            :database "default"))
+#+end_src
diff --git a/embed-chroma.el b/embed-chroma.el
new file mode 100644
index 0000000000..b14584b5ef
--- /dev/null
+++ b/embed-chroma.el
@@ -0,0 +1,230 @@
+;;; embed-chroma.el --- An interface to the chroma databases -*- lexical-binding: t; -*-
+
+;; Copyright (c) 2025  Free Software Foundation, Inc.
+
+;; Author: Andrew Hyatt <ahy...@gmail.com>
+;; Homepage: https://github.com/ahyatt/embed-db
+;; SPDX-License-Identifier: GPL-3.0-or-later
+;;
+;; This program is free software; you can redistribute it and/or
+;; modify it under the terms of the GNU General Public License as
+;; published by the Free Software Foundation; either version 3 of the
+;; License, or (at your option) any later version.
+;;
+;; This program is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.
+
+;;; Commentary:
+;; This package provides an interface to the chroma databases for embed-db.
+
+
+;;; Code:
+
+(require 'cl-lib)
+(require 'embed-db)
+(require 'json)
+(require 'plz)
+
+(defvar embed-chroma-collection-id-cache
+  (make-hash-table :test 'equal)
+  "Cache for chroma collection IDs.
+Entries are added by `embed-chroma-collection-id'.  This is a
+`defvar' rather than a `defconst' because the table is modified
+at runtime, and re-evaluating the file should not throw away the
+IDs that have already been looked up.")
+
+(cl-defstruct (embed-chroma-provider (:include embed-db-provider
+                                               (name "chroma")))
+  "An `embed-db' provider backed by a chroma server reached over HTTP.
+
+BINARY defaults to \"chroma\"; nothing in this file reads it.
+URL is the base URL of the server.  Request paths beginning with
+a slash are appended to it, so it should not end in a slash.
+TENANT and DATABASE name the chroma tenant and database that
+appear in every collection request path."
+  (binary "chroma")
+  (url "http://localhost:8000")
+  (tenant "default")
+  (database "default"))
+
+(defun embed-chroma-call (provider method url-suffix &optional body sync)
+  "Make an HTTP request to the Chroma API.
+PROVIDER is the `embed-chroma-provider' whose URL is the base of
+the request.  METHOD is the HTTP method symbol handed to `plz',
+such as `get' or `post'.  URL-SUFFIX is appended to the provider
+URL to form the full request URL.
+
+If BODY is provided, it will be JSON-encoded and sent as the
+request body.  SYNC indicates whether the request should be
+synchronous.
+
+When SYNC is non-nil, return the response parsed as JSON with
+objects represented as plists, or nil if the response body is
+blank.  Otherwise the request is asynchronous, its response is
+ignored, and the value of the `plz' call is returned.
+
+Signal an error if PROVIDER has no URL."
+  (let ((url (embed-chroma-provider-url provider)))
+    (unless url
+      (error "Chroma URL is not set"))
+    (let ((full-url (concat url url-suffix))
+          (headers '(("Content-Type" . "application/json")))
+          (payload (if body (json-encode body) "")))
+      (if sync
+          (let ((response (plz method full-url
+                            :headers headers
+                            :body payload)))
+            ;; A blank body is not valid JSON, so parsing it would signal
+            ;; an unrelated error; treat it as "no result" instead.
+            (unless (string-match-p "\\`[ \t\r\n]*\\'" response)
+              (json-parse-string response :object-type 'plist)))
+        (plz method full-url
+          :headers headers
+          :body payload
+          :then #'ignore)))))
+
+(defun embed-chroma-has-tenant-p (provider)
+  "Check if the chroma PROVIDER has a tenant.
+Return the parsed server response when the request succeeds, or
+nil if the server answers with HTTP status 404.  Signal an error
+for any other failure."
+  (condition-case err
+      (embed-chroma-call
+       provider
+       'get
+       (format "/api/v2/tenants/%s" (embed-chroma-provider-tenant provider))
+       nil
+       t)
+    (plz-error
+     ;; The third element of ERR is the `plz-error' struct.  Its response
+     ;; slot can be nil (no HTTP response was received), so check before
+     ;; asking for a status.
+     (let* ((details (nth 2 err))
+            (response (and (plz-error-p details)
+                           (plz-error-response details))))
+       (if (and response (eq 404 (plz-response-status response)))
+           nil
+         (error "Error checking tenant: %s" (error-message-string err)))))))
+
+(defun embed-chroma-has-database-p (provider database)
+  "Check if the chroma PROVIDER has a DATABASE.
+DATABASE is looked up under the tenant configured in PROVIDER.
+Return the parsed server response when the request succeeds, or
+nil if the server answers with HTTP status 404.  Signal an error
+for any other failure."
+  (condition-case err
+      (embed-chroma-call
+       provider
+       'get
+       (format "/api/v2/tenants/%s/databases/%s"
+               (embed-chroma-provider-tenant provider)
+               database)
+       nil
+       t)
+    (plz-error
+     ;; The third element of ERR is the `plz-error' struct.  Its response
+     ;; slot can be nil (no HTTP response was received), so check before
+     ;; asking for a status.
+     (let* ((details (nth 2 err))
+            (response (and (plz-error-p details)
+                           (plz-error-response details))))
+       (if (and response (eq 404 (plz-response-status response)))
+           nil
+         (error "Error checking database: %s" (error-message-string err)))))))
+
+(cl-defmethod embed-db-create ((provider embed-chroma-provider)
+                               (collection embed-db-collection))
+  "Create COLLECTION on the chroma server described by PROVIDER.
+The tenant and the database configured in PROVIDER are created
+first when the server does not have them yet.  Every request is
+synchronous; the value of the final collection request is
+returned."
+  (let* ((tenant (embed-chroma-provider-tenant provider))
+         (database (embed-chroma-provider-database provider))
+         (databases-path (format "/api/v2/tenants/%s/databases" tenant)))
+    ;; Step 1: make sure the tenant is there.
+    (when (not (embed-chroma-has-tenant-p provider))
+      (embed-chroma-call provider 'post "/api/v2/tenants"
+                         `(("name" . ,tenant))
+                         t))
+    ;; Step 2: make sure the database is there.
+    (when (not (embed-chroma-has-database-p provider database))
+      (embed-chroma-call provider 'post databases-path
+                         `(:name ,database)
+                         t))
+    ;; Step 3: create the collection itself.
+    (embed-chroma-call provider 'post
+                       (format "%s/%s/collections" databases-path database)
+                       `(:name ,(embed-db-collection-name collection))
+                       t)))
+
+(defun embed-chroma--collection-cache-key (provider collection)
+  "Return the key used to cache the ID of COLLECTION in PROVIDER.
+The key combines the server URL, tenant, database and collection
+name, so that collections sharing a name but living in different
+places do not share a cache entry."
+  (list (embed-chroma-provider-url provider)
+        (embed-chroma-provider-tenant provider)
+        (embed-chroma-provider-database provider)
+        (embed-db-collection-name collection)))
+
+(cl-defmethod embed-db-delete ((provider embed-chroma-provider)
+                               (collection embed-db-collection))
+  "Delete the chroma COLLECTION from PROVIDER.
+The request is synchronous and its parsed response is returned.
+After a successful request the cached ID of COLLECTION is
+dropped, so that a collection created later under the same name
+is looked up again instead of reusing the old ID."
+  (prog1
+      (embed-chroma-call
+       provider
+       'delete
+       (format "/api/v2/tenants/%s/databases/%s/collections/%s"
+               (embed-chroma-provider-tenant provider)
+               (embed-chroma-provider-database provider)
+               (embed-db-collection-name collection))
+       nil
+       t)
+    ;; Only reached when the request did not signal an error.
+    (remhash (embed-chroma--collection-cache-key provider collection)
+             embed-chroma-collection-id-cache)))
+
+(defun embed-chroma-collection-id (provider collection)
+  "Get the ID of a chroma COLLECTION in PROVIDER.
+The ID is taken from `embed-chroma-collection-id-cache' when
+present.  Otherwise it is fetched from the server with a
+synchronous request, stored in the cache, and returned.  Errors
+signaled by the request are not caught here."
+  (let ((key (embed-chroma--collection-cache-key provider collection)))
+    (or (gethash key embed-chroma-collection-id-cache)
+        (let* ((url (format "/api/v2/tenants/%s/databases/%s/collections/%s"
+                            (embed-chroma-provider-tenant provider)
+                            (embed-chroma-provider-database provider)
+                            (embed-db-collection-name collection)))
+               (result (embed-chroma-call provider 'get url nil t))
+               (id (plist-get result :id)))
+          ;; Do not remember a missing ID; the next call should ask again.
+          (when id
+            (puthash key id embed-chroma-collection-id-cache))
+          id))))
+
+(cl-defmethod embed-db-exists ((provider embed-chroma-provider)
+                               (collection embed-db-collection))
+  "Return non-nil if COLLECTION can be fetched from the chroma PROVIDER.
+The tenant is checked first, then the database, then the
+collection itself.  A `plz-error' raised while resolving or
+fetching the collection is treated as \"does not exist\" and
+yields nil."
+  (let ((database (embed-chroma-provider-database provider)))
+    (when (embed-chroma-has-tenant-p provider)
+      (when (embed-chroma-has-database-p provider database)
+        ;; Both the ID lookup and the fetch happen inside the handler.
+        (ignore-error plz-error
+          (embed-chroma-call
+           provider
+           'get
+           (format "/api/v2/tenants/%s/databases/%s/collections/%s"
+                   (embed-chroma-provider-tenant provider)
+                   database
+                   (embed-chroma-collection-id provider collection))
+           nil
+           t))))))
+
+(cl-defmethod embed-db-upsert-items ((provider embed-chroma-provider)
+                                     (collection embed-db-collection)
+                                     items &optional sync)
+  "Upsert ITEMS into the chroma COLLECTION of PROVIDER.
+ITEMS is a list of `embed-db-item' structs; their vectors, IDs
+and payloads are sent as the embeddings, ids and metadatas of a
+single request.  IDs are sent in their printed form, because
+chroma's API defines record ids as strings.
+
+If SYNC is non-nil the request is synchronous and the parsed
+response is returned."
+  (let ((url (format "/api/v2/tenants/%s/databases/%s/collections/%s/upsert"
+                     (embed-chroma-provider-tenant provider)
+                     (embed-chroma-provider-database provider)
+                     (embed-chroma-collection-id provider collection))))
+    (embed-chroma-call
+     provider
+     'post
+     url
+     `(:embeddings ,(vconcat (mapcar #'embed-db-item-vector items))
+       :ids ,(vconcat (mapcar (lambda (item)
+                                (format "%s" (embed-db-item-id item)))
+                              items))
+       :metadatas ,(vconcat (mapcar #'embed-db-item-payload items)))
+     sync)))
+
+(cl-defmethod embed-db-get-item ((provider embed-chroma-provider)
+                                 (collection embed-db-collection)
+                                 item-id)
+  "Get a single item from a chroma collection by ITEM-ID.
+PROVIDER and COLLECTION select where to look.  Return an
+`embed-db-item' whose ID is ITEM-ID and whose vector and payload
+come from the server's response.  Signal an error unless the
+server returns exactly one record."
+  (let* ((url (format "/api/v2/tenants/%s/databases/%s/collections/%s/get"
+                      (embed-chroma-provider-tenant provider)
+                      (embed-chroma-provider-database provider)
+                      (embed-chroma-collection-id provider collection)))
+         ;; The request carries a JSON body, so it is sent as a POST.
+         ;; Embeddings are asked for explicitly because the item's vector
+         ;; is read from them below.  The ID is sent in its printed
+         ;; form, since chroma's API defines record ids as strings.
+         (result (embed-chroma-call provider 'post url
+                                    `(:ids ,(vector (format "%s" item-id))
+                                      :limit 1
+                                      :include ["embeddings" "metadatas"])
+                                    t))
+         (ids (plist-get result :ids)))
+    (unless (= (length ids) 1)
+      (error "Expected exactly one item, got %d" (length ids)))
+    ;; Exactly one record matched the requested ID, so ITEM-ID itself is
+    ;; used as the ID of the returned item.
+    (make-embed-db-item
+     :id item-id
+     :vector (aref (plist-get result :embeddings) 0)
+     :payload (aref (plist-get result :metadatas) 0))))
+
+(cl-defmethod embed-db-delete-items ((provider embed-chroma-provider)
+                                     (collection embed-db-collection)
+                                     item-ids &optional sync)
+  "Delete items from a chroma collection by ITEM-IDS.
+PROVIDER and COLLECTION select the collection.  ITEM-IDS is a
+list of IDs; they are sent in their printed form, because
+chroma's API defines record ids as strings.
+
+If SYNC is non-nil the request is synchronous and the parsed
+response is returned."
+  (let ((url (format "/api/v2/tenants/%s/databases/%s/collections/%s/delete"
+                     (embed-chroma-provider-tenant provider)
+                     (embed-chroma-provider-database provider)
+                     (embed-chroma-collection-id provider collection))))
+    (embed-chroma-call
+     provider
+     'post
+     url
+     `(:ids ,(vconcat (mapcar (lambda (id) (format "%s" id)) item-ids)))
+     sync)))
+
+(cl-defmethod embed-db-search-by-vector ((provider embed-chroma-provider)
+                                         (collection embed-db-collection)
+                                         vector &optional limit)
+  "Search for items in a chroma collection by VECTOR.
+PROVIDER and COLLECTION select the collection.  LIMIT is the
+number of results to ask for, 10 if nil.
+
+Return a list of `embed-db-item' structs built from the ids,
+embeddings and metadatas of the response.  An ID that comes back
+as a string of decimal digits is converted to an integer; any
+other ID is returned unchanged."
+  (let* ((url (format "/api/v2/tenants/%s/databases/%s/collections/%s/query"
+                      (embed-chroma-provider-tenant provider)
+                      (embed-chroma-provider-database provider)
+                      (embed-chroma-collection-id provider collection)))
+         (result (embed-chroma-call
+                  provider 'post url
+                  `(:query_embeddings [,vector]
+                    :n_results ,(or limit 10)
+                    :include ["embeddings" "metadatas" "distances"])
+                  t))
+         ;; Only one query vector is sent, so only the first entry of each
+         ;; response field is read.
+         (ids (aref (plist-get result :ids) 0))
+         (all-vectors (plist-get result :embeddings))
+         (all-payloads (plist-get result :metadatas)))
+    (cl-loop for i from 0 below (length ids)
+             for id = (aref ids i)
+             collect (make-embed-db-item
+                      :id (if (and (stringp id)
+                                   (string-match-p "\\`[0-9]+\\'" id))
+                              (string-to-number id)
+                            id)
+                      :vector (aref (aref all-vectors 0) i)
+                      :payload (aref (aref all-payloads 0) i)))))
+
+(provide 'embed-chroma)
+;;; embed-chroma.el ends here

Reply via email to