This is an automated email from the ASF dual-hosted git repository.
lidavidm pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-adbc.git
The following commit(s) were added to refs/heads/main by this push:
new 464d39574 feat(python/adbc_driver_manager): add GetStatistics support (#4129)
464d39574 is described below
commit 464d395740f08a2199cf8627391ee0f687ca5fa4
Author: Mandukhai Alimaa <[email protected]>
AuthorDate: Mon Mar 23 18:47:42 2026 -0500
feat(python/adbc_driver_manager): add GetStatistics support (#4129)
Implement AdbcConnectionGetStatistics and
AdbcConnectionGetStatisticNames
---
.../adbc_driver_manager/_lib.pyi | 8 ++
.../adbc_driver_manager/_lib.pyx | 93 +++++++++++++++++++++
.../adbc_driver_manager/dbapi.py | 94 ++++++++++++++++++++++
python/adbc_driver_manager/tests/test_dbapi.py | 14 ++++
python/adbc_driver_postgresql/tests/test_dbapi.py | 43 ++++++++++
5 files changed, 252 insertions(+)
diff --git a/python/adbc_driver_manager/adbc_driver_manager/_lib.pyi b/python/adbc_driver_manager/adbc_driver_manager/_lib.pyi
index 218006431..62a3a70c6 100644
--- a/python/adbc_driver_manager/adbc_driver_manager/_lib.pyi
+++ b/python/adbc_driver_manager/adbc_driver_manager/_lib.pyi
@@ -70,6 +70,14 @@ class AdbcConnection(_AdbcHandle):
table_name: str,
) -> "ArrowSchemaHandle": ...
def get_table_types(self) -> "ArrowArrayStreamHandle": ...
+ def get_statistics(
+ self,
+ catalog: Optional[str] = ...,
+ db_schema: Optional[str] = ...,
+ table_name: Optional[str] = ...,
+ approximate: bool = ...,
+ ) -> "ArrowArrayStreamHandle": ...
+ def get_statistic_names(self) -> "ArrowArrayStreamHandle": ...
def read_partition(self, partition: bytes) -> "ArrowArrayStreamHandle": ...
def rollback(self) -> None: ...
def set_autocommit(self, enabled: bool) -> None: ...
diff --git a/python/adbc_driver_manager/adbc_driver_manager/_lib.pyx b/python/adbc_driver_manager/adbc_driver_manager/_lib.pyx
index 627b6fb38..87cfb4db6 100644
--- a/python/adbc_driver_manager/adbc_driver_manager/_lib.pyx
+++ b/python/adbc_driver_manager/adbc_driver_manager/_lib.pyx
@@ -1135,6 +1135,99 @@ cdef class AdbcConnection(_AdbcHandle):
check_error(status, &c_error)
return c_value
+ def get_statistics(
+ self,
+ catalog=None,
+ db_schema=None,
+ table_name=None,
+ approximate=True
+ ) -> ArrowArrayStreamHandle:
+ """
+ Get statistics about data distribution in table(s).
+
+ Parameters
+ ----------
+ catalog : str, optional
+ The catalog to filter by. May be a search pattern or None.
+ db_schema : str, optional
+ The database schema to filter by. May be a search pattern or None.
+ table_name : str, optional
+ The table name to filter by. May be a search pattern or None.
+ approximate : bool, default True
+ If True, allow approximate or cached statistics. If False, request
+ exact statistics (may be expensive or unsupported).
+
+ Returns
+ -------
+ ArrowArrayStreamHandle
+ A stream of statistics data with nested structure.
+
+ Notes
+ -----
+ Available since ADBC API revision 1.1.0. Drivers may return
+ approximate values even when exact values are requested, as
+ indicated in the result's statistic_is_approximate column.
+ """
+ cdef CAdbcError c_error = empty_error()
+ cdef CAdbcStatusCode status
+ cdef ArrowArrayStreamHandle stream = ArrowArrayStreamHandle()
+
+ cdef char* c_catalog = NULL
+ if catalog is not None:
+ catalog = _to_bytes(catalog, "catalog")
+ c_catalog = catalog
+
+ cdef char* c_db_schema = NULL
+ if db_schema is not None:
+ db_schema = _to_bytes(db_schema, "db_schema")
+ c_db_schema = db_schema
+
+ cdef char* c_table_name = NULL
+ if table_name is not None:
+ table_name = _to_bytes(table_name, "table_name")
+ c_table_name = table_name
+
+ cdef char c_approximate = 1 if approximate else 0
+
+ with nogil:
+ status = AdbcConnectionGetStatistics(
+ &self.connection,
+ c_catalog,
+ c_db_schema,
+ c_table_name,
+ c_approximate,
+ &stream.stream,
+ &c_error)
+ check_error(status, &c_error)
+ return stream
+
+ def get_statistic_names(self) -> ArrowArrayStreamHandle:
+ """
+ Get custom statistic names defined by this driver.
+
+ Returns
+ -------
+ ArrowArrayStreamHandle
+            A stream with columns: statistic_name (utf8), statistic_key (int16).
+
+ Notes
+ -----
+ Available since ADBC API revision 1.1.0. Returns driver-specific
+ statistic names and their keys. Standard ADBC statistics (keys 0-1023)
+ are not included - only driver-specific statistics.
+ """
+ cdef CAdbcError c_error = empty_error()
+ cdef CAdbcStatusCode status
+ cdef ArrowArrayStreamHandle stream = ArrowArrayStreamHandle()
+
+ with nogil:
+ status = AdbcConnectionGetStatisticNames(
+ &self.connection,
+ &stream.stream,
+ &c_error)
+ check_error(status, &c_error)
+ return stream
+
    def get_table_schema(self, catalog, db_schema, table_name) -> ArrowSchemaHandle:
"""
Get the Arrow schema of a table.
diff --git a/python/adbc_driver_manager/adbc_driver_manager/dbapi.py b/python/adbc_driver_manager/adbc_driver_manager/dbapi.py
index 775baaa52..d3cb15750 100644
--- a/python/adbc_driver_manager/adbc_driver_manager/dbapi.py
+++ b/python/adbc_driver_manager/adbc_driver_manager/dbapi.py
@@ -556,6 +556,100 @@ class Connection(_Closeable):
)
return self._backend.import_array_stream(handle)
+ def adbc_get_statistics(
+ self,
+ *,
+ catalog_filter: Optional[str] = None,
+ db_schema_filter: Optional[str] = None,
+ table_name_filter: Optional[str] = None,
+ approximate: bool = True,
+ ) -> "pyarrow.RecordBatchReader":
+ """
+ Get statistics about the data distribution of table(s).
+
+ The result is an Arrow dataset with a nested structure containing
+ table statistics. The schema includes:
+
+ - catalog_name (utf8)
+ - catalog_db_schemas (list of structs)
+
+ - db_schema_name (utf8)
+ - db_schema_statistics (list of structs)
+
+ - table_name (utf8)
+ - column_name (utf8, nullable) - null if applies to entire table
+ - statistic_key (int16) - dictionary-encoded statistic name
+ - statistic_value (dense union) - int64, uint64, float64, or binary
+ - statistic_is_approximate (bool)
+
+ Parameters
+ ----------
+ catalog_filter
+ An optional filter on the catalog names. May be a search pattern.
+ db_schema_filter
+            An optional filter on the database schema names. May be a search pattern.
+ table_name_filter
+ An optional filter on the table names. May be a search pattern.
+ approximate
+ If True (default), allow approximate or cached statistics.
+ If False, request exact statistics, which may be expensive or
+ unsupported. Note that drivers may still return approximate values
+ as indicated by the statistic_is_approximate column.
+
+ Returns
+ -------
+ pyarrow.RecordBatchReader
+ A reader for the statistics data.
+
+ Notes
+ -----
+ This is an extension and not part of the DBAPI standard.
+
+ Available since ADBC API revision 1.1.0. Not all drivers support
+ this method. If unsupported, a NotSupportedError will be raised.
+ """
+ handle = _blocking_call(
+ self._conn.get_statistics,
+ (),
+ dict(
+ catalog=catalog_filter,
+ db_schema=db_schema_filter,
+ table_name=table_name_filter,
+ approximate=approximate,
+ ),
+ self._conn.cancel,
+ )
+ return self._backend.import_array_stream(handle)
+
+ def adbc_get_statistic_names(self) -> "pyarrow.RecordBatchReader":
+ """
+ Get a list of custom statistic names defined by this driver.
+
+ The result contains two columns:
+ - statistic_name (utf8): The human-readable name of the statistic
+ - statistic_key (int16): The numeric key used in get_statistics results
+
+ Returns
+ -------
+ pyarrow.RecordBatchReader
+ A reader for the statistic names.
+
+ Notes
+ -----
+ This is an extension and not part of the DBAPI standard.
+
+ Available since ADBC API revision 1.1.0. Standard ADBC statistics
+ (keys 0-1023) are not included in this result - only driver-specific
+ statistics.
+ """
+ handle = _blocking_call(
+ self._conn.get_statistic_names,
+ (),
+ {},
+ self._conn.cancel,
+ )
+ return self._backend.import_array_stream(handle)
+
def adbc_get_table_schema(
self,
table_name: str,
diff --git a/python/adbc_driver_manager/tests/test_dbapi.py b/python/adbc_driver_manager/tests/test_dbapi.py
index 78506c3df..8653d94e8 100644
--- a/python/adbc_driver_manager/tests/test_dbapi.py
+++ b/python/adbc_driver_manager/tests/test_dbapi.py
@@ -131,6 +131,20 @@ def test_get_table_types(sqlite) -> None:
assert sqlite.adbc_get_table_types() == ["table", "view"]
[email protected]
+def test_get_statistics_not_supported(sqlite) -> None:
+ """SQLite does not support GetStatistics."""
+ with pytest.raises(dbapi.NotSupportedError):
+ sqlite.adbc_get_statistics()
+
+
[email protected]
+def test_get_statistic_names_not_supported(sqlite) -> None:
+ """SQLite does not support GetStatisticNames."""
+ with pytest.raises(dbapi.NotSupportedError):
+ sqlite.adbc_get_statistic_names()
+
+
class ArrayWrapper:
def __init__(self, array) -> None:
self.array = array
diff --git a/python/adbc_driver_postgresql/tests/test_dbapi.py b/python/adbc_driver_postgresql/tests/test_dbapi.py
index 952389de2..0fd68587c 100644
--- a/python/adbc_driver_postgresql/tests/test_dbapi.py
+++ b/python/adbc_driver_postgresql/tests/test_dbapi.py
@@ -102,6 +102,49 @@ def test_conn_get_info(postgres: dbapi.Connection) -> None:
assert info["vendor_name"] == "PostgreSQL"
+def test_get_statistics(postgres: dbapi.Connection) -> None:
+ with postgres.cursor() as cur:
+ cur.execute("DROP TABLE IF EXISTS test_statistics")
+        cur.execute("CREATE TABLE test_statistics (id INT PRIMARY KEY, value TEXT)")
+        cur.execute("INSERT INTO test_statistics VALUES (1, 'a'), (2, 'b'), (3, 'c')")
+ cur.execute("ANALYZE test_statistics")
+ postgres.commit()
+
+    # PostgreSQL requires db_schema to be specified and only supports approximate stats
+ reader = postgres.adbc_get_statistics(
+        db_schema_filter="public", table_name_filter="test_statistics", approximate=True
+ )
+ assert reader is not None
+ table = reader.read_all()
+
+ # Verify schema is correct
+ assert "catalog_name" in table.schema.names
+ assert "catalog_db_schemas" in table.schema.names
+
+ # Verify we got actual statistics for our table
+ result_list = table.to_pylist()
+ found_test_table = False
+ for catalog in result_list:
+ for schema in catalog["catalog_db_schemas"]:
+ assert schema["db_schema_name"] == "public"
+ found_test_table = found_test_table or any(
+ stat["table_name"] == "test_statistics"
+ for stat in schema["db_schema_statistics"]
+ )
+
+ assert found_test_table, "Expected statistics for 'test_statistics'"
+
+
+def test_get_statistic_names(postgres: dbapi.Connection) -> None:
+ reader = postgres.adbc_get_statistic_names()
+ assert reader is not None
+ table = reader.read_all()
+
+ # Verify schema
+ assert "statistic_name" in table.schema.names
+ assert "statistic_key" in table.schema.names
+
+
def test_query_batch_size(postgres: dbapi.Connection):
with postgres.cursor() as cur:
cur.execute("DROP TABLE IF EXISTS test_batch_size")