This is an automated email from the ASF dual-hosted git repository.
lidavidm pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-adbc.git
The following commit(s) were added to refs/heads/main by this push:
new 464d39574 feat(python/adbc_driver_manager): add GetStatistics support (#4129)
464d39574 is described below
commit 464d395740f08a2199cf8627391ee0f687ca5fa4
Author: Mandukhai Alimaa <[email protected]>
AuthorDate: Mon Mar 23 18:47:42 2026 -0500
feat(python/adbc_driver_manager): add GetStatistics support (#4129)
Implement AdbcConnectionGetStatistics and
AdbcConnectionGetStatisticNames
---
.../adbc_driver_manager/_lib.pyi | 8 ++
.../adbc_driver_manager/_lib.pyx | 93 +++++++++++++++++++++
.../adbc_driver_manager/dbapi.py | 94 ++++++++++++++++++++++
python/adbc_driver_manager/tests/test_dbapi.py | 14 ++++
python/adbc_driver_postgresql/tests/test_dbapi.py | 43 ++++++++++
5 files changed, 252 insertions(+)
diff --git a/python/adbc_driver_manager/adbc_driver_manager/_lib.pyi b/python/adbc_driver_manager/adbc_driver_manager/_lib.pyi
index 218006431..62a3a70c6 100644
--- a/python/adbc_driver_manager/adbc_driver_manager/_lib.pyi
+++ b/python/adbc_driver_manager/adbc_driver_manager/_lib.pyi
@@ -70,6 +70,14 @@ class AdbcConnection(_AdbcHandle):
table_name: str,
) -> "ArrowSchemaHandle": ...
def get_table_types(self) -> "ArrowArrayStreamHandle": ...
+ def get_statistics(
+ self,
+ catalog: Optional[str] = ...,
+ db_schema: Optional[str] = ...,
+ table_name: Optional[str] = ...,
+ approximate: bool = ...,
+ ) -> "ArrowArrayStreamHandle": ...
+ def get_statistic_names(self) -> "ArrowArrayStreamHandle": ...
def read_partition(self, partition: bytes) -> "ArrowArrayStreamHandle": ...
def rollback(self) -> None: ...
def set_autocommit(self, enabled: bool) -> None: ...
diff --git a/python/adbc_driver_manager/adbc_driver_manager/_lib.pyx b/python/adbc_driver_manager/adbc_driver_manager/_lib.pyx
index 627b6fb38..87cfb4db6 100644
--- a/python/adbc_driver_manager/adbc_driver_manager/_lib.pyx
+++ b/python/adbc_driver_manager/adbc_driver_manager/_lib.pyx
@@ -1135,6 +1135,99 @@ cdef class AdbcConnection(_AdbcHandle):
check_error(status, &c_error)
return c_value
+ def get_statistics(
+ self,
+ catalog=None,
+ db_schema=None,
+ table_name=None,
+ approximate=True
+ ) -> ArrowArrayStreamHandle:
+ """
+ Get statistics about data distribution in table(s).
+
+ Parameters
+ ----------
+ catalog : str, optional
+ The catalog to filter by. May be a search pattern or None.
+ db_schema : str, optional
+ The database schema to filter by. May be a search pattern or None.
+ table_name : str, optional
+ The table name to filter by. May be a search pattern or None.
+ approximate : bool, default True
+ If True, allow approximate or cached statistics. If False, request
+ exact statistics (may be expensive or unsupported).
+
+ Returns
+ -------
+ ArrowArrayStreamHandle
+ A stream of statistics data with nested structure.
+
+ Notes
+ -----
+ Available since ADBC API revision 1.1.0. Drivers may return
+ approximate values even when exact values are requested, as
+ indicated in the result's statistic_is_approximate column.
+ """
+ cdef CAdbcError c_error = empty_error()
+ cdef CAdbcStatusCode status
+ cdef ArrowArrayStreamHandle stream = ArrowArrayStreamHandle()
+
+ cdef char* c_catalog = NULL
+ if catalog is not None:
+ catalog = _to_bytes(catalog, "catalog")
+ c_catalog = catalog
+
+ cdef char* c_db_schema = NULL
+ if db_schema is not None:
+ db_schema = _to_bytes(db_schema, "db_schema")
+ c_db_schema = db_schema
+
+ cdef char* c_table_name = NULL
+ if table_name is not None:
+ table_name = _to_bytes(table_name, "table_name")
+ c_table_name = table_name
+
+ cdef char c_approximate = 1 if approximate else 0
+
+ with nogil:
+ status = AdbcConnectionGetStatistics(
+ &self.connection,
+ c_catalog,
+ c_db_schema,
+ c_table_name,
+ c_approximate,
+ &stream.stream,
+ &c_error)
+ check_error(status, &c_error)
+ return stream
+
+ def get_statistic_names(self) -> ArrowArrayStreamHandle:
+ """
+ Get custom statistic names defined by this driver.
+
+ Returns
+ -------
+ ArrowArrayStreamHandle
+            A stream with columns: statistic_name (utf8), statistic_key (int16).
+
+ Notes
+ -----
+ Available since ADBC API revision 1.1.0. Returns driver-specific
+ statistic names and their keys. Standard ADBC statistics (keys 0-1023)
+ are not included - only driver-specific statistics.
+ """
+ cdef CAdbcError c_error = empty_error()
+ cdef CAdbcStatusCode status
+ cdef ArrowArrayStreamHandle stream = ArrowArrayStreamHandle()
+
+ with nogil:
+ status = AdbcConnectionGetStatisticNames(
+ &self.connection,
+ &stream.stream,
+ &c_error)
+ check_error(status, &c_error)
+ return stream
+
    def get_table_schema(self, catalog, db_schema, table_name) -> ArrowSchemaHandle:
"""
Get the Arrow schema of a table.
diff --git a/python/adbc_driver_manager/adbc_driver_manager/dbapi.py b/python/adbc_driver_manager/adbc_driver_manager/dbapi.py
index 775baaa52..d3cb15750 100644
--- a/python/adbc_driver_manager/adbc_driver_manager/dbapi.py
+++ b/python/adbc_driver_manager/adbc_driver_manager/dbapi.py
@@ -556,6 +556,100 @@ class Connection(_Closeable):
)
return self._backend.import_array_stream(handle)
+ def adbc_get_statistics(
+ self,
+ *,
+ catalog_filter: Optional[str] = None,
+ db_schema_filter: Optional[str] = None,
+ table_name_filter: Optional[str] = None,
+ approximate: bool = True,
+ ) -> "pyarrow.RecordBatchReader":
+ """
+ Get statistics about the data distribution of table(s).
+
+ The result is an Arrow dataset with a nested structure containing
+ table statistics. The schema includes:
+
+ - catalog_name (utf8)
+ - catalog_db_schemas (list of structs)
+
+ - db_schema_name (utf8)
+ - db_schema_statistics (list of structs)
+
+ - table_name (utf8)
+ - column_name (utf8, nullable) - null if applies to entire table
+ - statistic_key (int16) - dictionary-encoded statistic name
+ - statistic_value (dense union) - int64, uint64, float64, or binary
+ - statistic_is_approximate (bool)
+
+ Parameters
+ ----------
+ catalog_filter
+ An optional filter on the catalog names. May be a search pattern.
+ db_schema_filter
+            An optional filter on the database schema names. May be a search pattern.
+ table_name_filter
+ An optional filter on the table names. May be a search pattern.
+ approximate
+ If True (default), allow approximate or cached statistics.
+ If False, request exact statistics, which may be expensive or
+ unsupported. Note that drivers may still return approximate values
+ as indicated by the statistic_is_approximate column.
+
+ Returns
+ -------
+ pyarrow.RecordBatchReader
+ A reader for the statistics data.
+
+ Notes
+ -----
+ This is an extension and not part of the DBAPI standard.
+
+ Available since ADBC API revision 1.1.0. Not all drivers support
+ this method. If unsupported, a NotSupportedError will be raised.
+ """
+ handle = _blocking_call(
+ self._conn.get_statistics,
+ (),
+ dict(
+ catalog=catalog_filter,
+ db_schema=db_schema_filter,
+ table_name=table_name_filter,
+ approximate=approximate,
+ ),
+ self._conn.cancel,
+ )
+ return self._backend.import_array_stream(handle)
+
+ def adbc_get_statistic_names(self) -> "pyarrow.RecordBatchReader":
+ """
+ Get a list of custom statistic names defined by this driver.
+
+ The result contains two columns:
+ - statistic_name (utf8): The human-readable name of the statistic
+ - statistic_key (int16): The numeric key used in get_statistics results
+
+ Returns
+ -------
+ pyarrow.RecordBatchReader
+ A reader for the statistic names.
+
+ Notes
+ -----
+ This is an extension and not part of the DBAPI standard.
+
+ Available since ADBC API revision 1.1.0. Standard ADBC statistics
+ (keys 0-1023) are not included in this result - only driver-specific
+ statistics.
+ """
+ handle = _blocking_call(
+ self._conn.get_statistic_names,
+ (),
+ {},
+ self._conn.cancel,
+ )
+ return self._backend.import_array_stream(handle)
+
def adbc_get_table_schema(
self,
table_name: str,
diff --git a/python/adbc_driver_manager/tests/test_dbapi.py b/python/adbc_driver_manager/tests/test_dbapi.py
index 78506c3df..8653d94e8 100644
--- a/python/adbc_driver_manager/tests/test_dbapi.py
+++ b/python/adbc_driver_manager/tests/test_dbapi.py
@@ -131,6 +131,20 @@ def test_get_table_types(sqlite) -> None:
assert sqlite.adbc_get_table_types() == ["table", "view"]
[email protected]
+def test_get_statistics_not_supported(sqlite) -> None:
+ """SQLite does not support GetStatistics."""
+ with pytest.raises(dbapi.NotSupportedError):
+ sqlite.adbc_get_statistics()
+
+
[email protected]
+def test_get_statistic_names_not_supported(sqlite) -> None:
+ """SQLite does not support GetStatisticNames."""
+ with pytest.raises(dbapi.NotSupportedError):
+ sqlite.adbc_get_statistic_names()
+
+
class ArrayWrapper:
def __init__(self, array) -> None:
self.array = array
diff --git a/python/adbc_driver_postgresql/tests/test_dbapi.py b/python/adbc_driver_postgresql/tests/test_dbapi.py
index 952389de2..0fd68587c 100644
--- a/python/adbc_driver_postgresql/tests/test_dbapi.py
+++ b/python/adbc_driver_postgresql/tests/test_dbapi.py
@@ -102,6 +102,49 @@ def test_conn_get_info(postgres: dbapi.Connection) -> None:
assert info["vendor_name"] == "PostgreSQL"
+def test_get_statistics(postgres: dbapi.Connection) -> None:
+ with postgres.cursor() as cur:
+ cur.execute("DROP TABLE IF EXISTS test_statistics")
+        cur.execute("CREATE TABLE test_statistics (id INT PRIMARY KEY, value TEXT)")
+        cur.execute("INSERT INTO test_statistics VALUES (1, 'a'), (2, 'b'), (3, 'c')")
+ cur.execute("ANALYZE test_statistics")
+ postgres.commit()
+
+    # PostgreSQL requires db_schema to be specified and only supports approximate stats
+ reader = postgres.adbc_get_statistics(
+        db_schema_filter="public", table_name_filter="test_statistics", approximate=True
+ )
+ assert reader is not None
+ table = reader.read_all()
+
+ # Verify schema is correct
+ assert "catalog_name" in table.schema.names
+ assert "catalog_db_schemas" in table.schema.names
+
+ # Verify we got actual statistics for our table
+ result_list = table.to_pylist()
+ found_test_table = False
+ for catalog in result_list:
+ for schema in catalog["catalog_db_schemas"]:
+ assert schema["db_schema_name"] == "public"
+ found_test_table = found_test_table or any(
+ stat["table_name"] == "test_statistics"
+ for stat in schema["db_schema_statistics"]
+ )
+
+ assert found_test_table, "Expected statistics for 'test_statistics'"
+
+
+def test_get_statistic_names(postgres: dbapi.Connection) -> None:
+ reader = postgres.adbc_get_statistic_names()
+ assert reader is not None
+ table = reader.read_all()
+
+ # Verify schema
+ assert "statistic_name" in table.schema.names
+ assert "statistic_key" in table.schema.names
+
+
def test_query_batch_size(postgres: dbapi.Connection):
with postgres.cursor() as cur:
cur.execute("DROP TABLE IF EXISTS test_batch_size")