branch: elpa/gptel
commit 4abb9581ff8aa728bfe29b8fbc987a21166f3813
Author: Karthik Chikmagalur <karthikchikmaga...@gmail.com>
Commit: Karthik Chikmagalur <karthikchikmaga...@gmail.com>

    gptel: Add JSON response schema parsing and preprocessing
    
    Add experimental support for structured outputs to `gptel-request'
    via the :schema argument.  It is now possible to force an LLM to
    respond in JSON conforming to the provided schema.
    
    Note: This commit only adds the infrastructure required for the
    feature!  No backend currently respects :schema.  Support for all
    backends will be added in the next commit.
    
    There are several caveats with this feature in its current form:
    
    1. Not all providers support it, but the major backends do:
    OpenAI, Anthropic, Gemini, llama-cpp and Ollama.
    Support for structured outputs among other "OpenAI-compatible"
    backends is flaky.
    
    2. `gptel-send' does not yet support structured outputs, as it is
    intended to be a general chat command.  Only the `gptel-request'
    API does.  (Schema support for `gptel-send' can be added if there
    is sufficient demand.)
    
    3. Schemas whose root elements are of type array are not
    supported by most APIs.  In this case the schema is wrapped in an
    object with a single field named "items", and it is the caller's
    responsibility to extract the array elements from that field.
    
    4. The JSON schema has to be supplied in one of two ways:
    - As an elisp object consisting of nested plists, similar to how
    arguments in gptel-tool definitions are provided.
    - As a JSON schema serialized to a string.
    While expressive, both formats are cumbersome for quick use, so
    support for other shorthand specifications is planned.
    
    * gptel.el (gptel--with-buffer-copy-internal): Copy
    `gptel--schema' as well.
    
    (gptel--schema, gptel-request): Add :schema argument, use the
    internal variable `gptel--schema' to communicate this to the
    payload builders (primarily `gptel--request-data').  The docstring
    for :schema is inadequate, but it will require too many lines in
    an already long description.  This will be updated after adding
    other ways to specify the schema.
    
    (gptel--parse-schema): Generic function to parse a provided schema
    into a backend-appropriate format.
    
    (gptel--preprocess-schema, gptel--dispatch-schema-type): Utility
    functions to sanitize provided schemas.  The former converts
    symbol :type values in the spec to strings (see
    `gptel--preprocess-tool-args' for why), and for every object type
    sets :additionalProperties to false and lists all of its
    properties as :required.  The latter handles schemas provided as
    serialized JSON, and wraps a root-level array specification in an
    object.  This wrapping is needed since most APIs require an
    object type at the schema root.
    
    (gptel--tool-use-p, gptel--tool-result-p): Don't check for
    `:tools' in INFO, as the Anthropic API uses an ersatz tool to
    provide JSON output as tool call arguments.  This ersatz tool is
    not defined by the user and not included in `:tools'.
    
    (gptel--handle-tool-use): Handle JSON output masquerading as a
    tool call.  This is for the Anthropic API only.
---
 gptel.el | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 77 insertions(+), 7 deletions(-)

diff --git a/gptel.el b/gptel.el
index a05d4c004a..38eaf754f7 100644
--- a/gptel.el
+++ b/gptel.el
@@ -914,6 +914,11 @@ These parameters are combined with model-specific and backend-specific
 incompatible with the active backend can break gptel.  Do not use this
 variable unless you know what you're doing!")
 
+(defconst gptel--ersatz-json-tool "response_json"
+  "Name of ersatz tool used to force JSON output.
+
+Some APIs, like Anthropic, use a tool to produce structured JSON output.")
+
 
 ;;; Utility functions
 
@@ -1092,7 +1097,7 @@ For BUF, START, END and BODY-THUNK see `gptel--with-buffer-copy'."
     (with-current-buffer temp-buffer
       (dolist (sym '( gptel-backend gptel--system-message gptel-model
                       gptel-mode gptel-track-response gptel-track-media
-                      gptel-use-tools gptel-tools gptel-use-curl
+                      gptel-use-tools gptel-tools gptel-use-curl gptel--schema
                       gptel-use-context gptel--num-messages-to-send
                       gptel-stream gptel-include-reasoning gptel--request-params
                       gptel-temperature gptel-max-tokens gptel-cache))
@@ -1577,6 +1582,60 @@ file."
 (declare-function gptel-context--wrap "gptel-context")
 
 
+;;; Structured output
+(defvar gptel--schema nil
+  "Response output schema for backends that support it.")
+
+(cl-defgeneric gptel--parse-schema (_backend _schema)
+  "Parse JSON schema in a backend-appropriate way.")
+
+(defun gptel--dispatch-schema-type (schema)
+  "Convert SCHEMA to a valid elisp representation."
+  (when (stringp schema)
+    (setq schema (gptel--json-read-string schema)))
+  ;; The OpenAI and Anthropic APIs don't allow arrays at the root of the schema.
+  ;; Work around this by wrapping it in an object with the field "items".
+  ;; TODO(schema): Find some way to strip this extra layer from the response.
+  (if (member (plist-get schema :type) '("array" array))
+      (list :type "object"
+            :properties (list :items schema)
+            :required ["items"]
+            :additionalProperties :json-false)
+    schema))
+
+(defun gptel--preprocess-schema (spec)
+  "Set additionalProperties for objects in SPEC destructively.
+
+Convert symbol :types to strings."
+  ;; NOTE: Do not use `sequencep' here, as that covers strings too and breaks
+  ;; things.
+  (when (or (listp spec) (vectorp spec))
+    (cond
+     ((vectorp spec)
+      (cl-loop for element across spec
+               for idx upfrom 0
+               do (aset spec idx (gptel--preprocess-schema element))))
+     ((keywordp (car spec))
+      (let ((tail spec))
+        (while tail
+          (when (eq (car tail) :type)
+            (when (symbolp (cadr tail)) ;Convert symbol :type to string
+              (setcar (cdr tail) (symbol-name (cadr tail))))
+            (when (equal (cadr tail) "object") ;Add additional object fields
+              (plist-put tail :additionalProperties :json-false)
+              (let ((props
+                     (cl-loop for prop in (plist-get tail :properties) by #'cddr
+                              collect (substring (symbol-name prop) 1))))
+                (plist-put tail :required (vconcat props)))))
+          (when (or (listp (cadr tail)) (vectorp (cadr tail)))
+            (gptel--preprocess-schema (cadr tail)))
+          (setq tail (cddr tail)))))
+     ((listp spec) (dolist (element spec)
+                     (when (listp element)
+                       (gptel--preprocess-schema element))))))
+  spec)
+
+
 ;;; Tool use
 
 (defcustom gptel-use-tools t
@@ -2242,7 +2301,12 @@ Run post-response hooks."
                                   (cons 'tool-result result-alist) info)
                          (gptel--fsm-transition fsm)))))
              (if (null tool-spec)
-                 (message "Unknown tool called by model: %s" name)
+                 (if (equal name gptel--ersatz-json-tool) ;Could be a JSON response
+                     ;; Handle structured JSON output supplied as tool call
+                     (funcall (plist-get info :callback)
+                              (gptel--json-encode (plist-get tool-call :args))
+                              info)
+                   (message "Unknown tool called by model: %s" name))
                (setq arg-values
                      (mapcar
                       (lambda (arg)
@@ -2277,11 +2341,9 @@ Run post-response hooks."
 
 (defun gptel--error-p (info) (plist-get info :error))
 
-(defun gptel--tool-use-p (info)
-  (and (plist-get info :tools) (plist-get info :tool-use)))
+(defun gptel--tool-use-p (info) (plist-get info :tool-use))
 
-(defun gptel--tool-result-p (info)
-  (and (plist-get info :tools) (plist-get info :tool-success)))
+(defun gptel--tool-result-p (info) (plist-get info :tool-success))
 
 ;; TODO(prompt-list): Document new prompt input format to `gptel-request'.
 
@@ -2292,7 +2354,7 @@ Run post-response hooks."
                position context dry-run
                (stream nil) (in-place nil)
                (system gptel--system-message)
-               transforms (fsm (gptel-make-fsm)))
+               schema transforms (fsm (gptel-make-fsm)))
   "Request a response from the `gptel-backend' for PROMPT.
 
 The request is asynchronous, this function returns immediately.
@@ -2442,6 +2504,13 @@ additional information (such as from a RAG engine).
   and the state machine.  It should run the callback after finishing its
   transformation.
 
+If provided, SCHEMA forces the LLM to generate JSON output.  Its value
+is a JSON schema, which can be provided as an elisp object, a nested
+plist structure.  See the manual or the wiki for examples.
+
+Note: SCHEMA is presently experimental and subject to change, and not
+all providers support structured output.
+
 See `gptel-prompt-transform-functions' for more.
 
 FSM is the state machine driving the request.  This can be used
@@ -2470,6 +2539,7 @@ be used to rerun or continue the request at a later time."
            ((markerp position) position)
            ((integerp position)
             (set-marker (make-marker) position buffer))))
+         (gptel--schema schema)
          (prompt-buffer
           (cond                       ;prompt from buffer or explicitly supplied
            ((null prompt)

Reply via email to