This is an automated email from the ASF dual-hosted git repository.
rok pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new c187333336 GH-48241: [Python] Scalar inferencing doesn't infer UUID
(#48727)
c187333336 is described below
commit c187333336552cf2872a0e8fd240baabf547407c
Author: tadeja <[email protected]>
AuthorDate: Tue Mar 3 20:39:14 2026 +0100
GH-48241: [Python] Scalar inferencing doesn't infer UUID (#48727)
### Rationale for this change
This closes #48241, #44224 and #43855.
Currently uuid.UUID objects are not automatically inferred or converted in
PyArrow, so users must explicitly specify the type.
### What changes are included in this PR?
Adding support for Python's uuid.UUID objects in PyArrow's type inference
and conversion.
### Are these changes tested?
Yes, added test_uuid_scalar_from_python() and test_uuid_array_from_python()
in `test_extension.py`.
### Are there any user-facing changes?
Users can now pass Python uuid.UUID objects directly to PyArrow functions
like pa.scalar() and pa.array() without specifying the type:
```python
import uuid
import pyarrow as pa
pa.scalar(uuid.uuid4())
```
<pyarrow.UuidScalar: UUID('958174b9-3a5c-4cdd-8fc5-d51a2fc55784')>
```python
pa.array([uuid.uuid4()])
```
<pyarrow.lib.UuidArray object at 0x1217725f0>
[
73611FD81F764A209C8B9CDBADDA1F53
]
* GitHub Issue: #48241
Lead-authored-by: Tadeja Kadunc <[email protected]>
Co-authored-by: tadeja <[email protected]>
Co-authored-by: Rok Mihevc <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Rok Mihevc <[email protected]>
---
docs/source/python/extending_types.rst | 44 +++++-
python/pyarrow/src/arrow/python/common.h | 14 ++
python/pyarrow/src/arrow/python/helpers.cc | 154 ++++++++++++---------
python/pyarrow/src/arrow/python/helpers.h | 4 +
python/pyarrow/src/arrow/python/inference.cc | 8 ++
python/pyarrow/src/arrow/python/python_to_arrow.cc | 25 +++-
python/pyarrow/tests/test_extension_type.py | 87 ++++++++++++
7 files changed, 265 insertions(+), 71 deletions(-)
diff --git a/docs/source/python/extending_types.rst
b/docs/source/python/extending_types.rst
index 48262b6807..fec04c182a 100644
--- a/docs/source/python/extending_types.rst
+++ b/docs/source/python/extending_types.rst
@@ -476,8 +476,8 @@ You can find the official list of canonical extension types
in the
:ref:`format_canonical_extensions` section. Here we add examples on how to
use them in PyArrow.
-Fixed size tensor
-"""""""""""""""""
+Fixed shape tensor
+""""""""""""""""""
To create an array of tensors with equal shape (fixed shape tensor array) we
first need to define a fixed shape tensor extension type with value type
@@ -487,7 +487,7 @@ and shape:
>>> tensor_type = pa.fixed_shape_tensor(pa.int32(), (2, 2))
-Then we need the storage array with :func:`pyarrow.list_` type where
``value_type```
+Then we need the storage array with :func:`pyarrow.list_` type where
``value_type``
is the fixed shape tensor value type and list size is a product of
``tensor_type``
shape elements. Then we can create an array of tensors with
``pa.ExtensionArray.from_storage()`` method:
@@ -629,3 +629,41 @@ for ``NCHW`` format where:
* C: number of channels of the image
* H: height of the image
* W: width of the image
+
+UUID
+""""
+
+The UUID extension type (``arrow.uuid``) represents universally unique
+identifiers as 16-byte fixed-size binary values. PyArrow provides integration
+with Python's built-in :mod:`uuid` module, including automatic type inference.
+
+Creating UUID scalars and arrays
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+PyArrow infers the UUID type from Python's ``uuid.UUID`` objects,
+so you can pass them directly to :func:`pyarrow.scalar` and
:func:`pyarrow.array`:
+
+.. code-block:: python
+
+ >>> import uuid
+ >>> import pyarrow as pa
+
+ >>> pa.scalar(uuid.uuid4())
+ <pyarrow.UuidScalar: UUID('...')>
+
+ >>> uuids = [uuid.uuid4() for _ in range(3)]
+ >>> arr = pa.array(uuids)
+ >>> arr.type
+ UuidType(extension<arrow.uuid>)
+
+You can also explicitly specify the UUID type using :func:`pyarrow.uuid`:
+
+.. code-block:: python
+
+ >>> pa.array([uuid.uuid4(), uuid.uuid4()], type=pa.uuid())
+ <pyarrow.lib.UuidArray object at ...>
+ [
+ ...,
+ ...
+ ]
+
diff --git a/python/pyarrow/src/arrow/python/common.h
b/python/pyarrow/src/arrow/python/common.h
index affefe2859..a81782330b 100644
--- a/python/pyarrow/src/arrow/python/common.h
+++ b/python/pyarrow/src/arrow/python/common.h
@@ -419,6 +419,20 @@ struct PyBytesView {
return Status::OK();
}
+ // Parse bytes from a uuid.UUID object (stores reference to keep bytes alive)
+ Status ParseUuid(PyObject* obj) {
+ ref.reset(PyObject_GetAttrString(obj, "bytes"));
+ RETURN_IF_PYERROR();
+ if (!PyBytes_Check(ref.obj())) {
+ return Status::TypeError("Expected uuid.UUID.bytes to return bytes, got
'",
+ Py_TYPE(ref.obj())->tp_name, "' object");
+ }
+ bytes = PyBytes_AS_STRING(ref.obj());
+ size = PyBytes_GET_SIZE(ref.obj());
+ is_utf8 = false;
+ return Status::OK();
+ }
+
protected:
OwnedRef ref;
};
diff --git a/python/pyarrow/src/arrow/python/helpers.cc
b/python/pyarrow/src/arrow/python/helpers.cc
index 0a24b25931..3515455d22 100644
--- a/python/pyarrow/src/arrow/python/helpers.cc
+++ b/python/pyarrow/src/arrow/python/helpers.cc
@@ -296,16 +296,69 @@ bool PyFloat_IsNaN(PyObject* obj) {
namespace {
-// This needs a conditional, because using std::once_flag could introduce
-// a deadlock when the GIL is enabled. See
-//
https://github.com/apache/arrow/commit/f69061935e92e36e25bb891177ca8bc4f463b272
for
-// more info.
+// Thread-safe one-time Python module import + attribute lookup. For Pandas
and UUID.
+// Uses std::call_once when the GIL is disabled, or a simple boolean flag when
+// the GIL is enabled to avoid deadlocks. See ARROW-10519 for more details and
+//
https://github.com/apache/arrow/commit/f69061935e92e36e25bb891177ca8bc4f463b272
+struct ModuleOnceRunner {
+ std::string module_name;
#ifdef Py_GIL_DISABLED
-static std::once_flag pandas_static_initialized;
+ std::once_flag initialized;
#else
-static bool pandas_static_initialized = false;
+ bool initialized = false;
#endif
+ explicit ModuleOnceRunner(const std::string& module_name) :
module_name(module_name) {}
+
+ template <typename Func>
+ void RunOnce(Func&& func) {
+ auto do_init = [&]() {
+ OwnedRef module;
+ if (ImportModule(module_name, &module).ok()) {
+#ifndef Py_GIL_DISABLED
+ // Since ImportModule can release the GIL, another thread could have
+ // already initialized the static data.
+ if (initialized) {
+ return;
+ }
+#endif
+ func(module);
+ }
+ };
+#ifdef Py_GIL_DISABLED
+ std::call_once(initialized, do_init);
+#else
+ if (!initialized) {
+ do_init();
+ initialized = true;
+ }
+#endif
+ }
+};
+
+static PyObject* uuid_UUID = nullptr;
+static ModuleOnceRunner uuid_runner("uuid");
+
+} // namespace
+
+bool IsPyUuid(PyObject* obj) {
+ uuid_runner.RunOnce([](OwnedRef& module) {
+ OwnedRef ref;
+ if (ImportFromModule(module.obj(), "UUID", &ref).ok()) {
+ uuid_UUID = ref.obj();
+ }
+ });
+ if (!uuid_UUID) return false;
+ int result = PyObject_IsInstance(obj, uuid_UUID);
+ if (result < 0) {
+ PyErr_Clear();
+ return false;
+ }
+ return result != 0;
+}
+
+namespace {
+
// Once initialized, these variables hold borrowed references to Pandas static
data.
// We should not use OwnedRef here because Python destructors would be
// called on a finalized interpreter.
@@ -315,72 +368,43 @@ static PyObject* pandas_Timedelta = nullptr;
static PyObject* pandas_Timestamp = nullptr;
static PyTypeObject* pandas_NaTType = nullptr;
static PyObject* pandas_DateOffset = nullptr;
+static ModuleOnceRunner pandas_runner("pandas");
-void GetPandasStaticSymbols() {
- OwnedRef pandas;
-
- // Import pandas
- Status s = ImportModule("pandas", &pandas);
- if (!s.ok()) {
- return;
- }
-
-#ifndef Py_GIL_DISABLED
- // Since ImportModule can release the GIL, another thread could have
- // already initialized the static data.
- if (pandas_static_initialized) {
- return;
- }
-#endif
-
- OwnedRef ref;
-
- // set NaT sentinel and its type
- if (ImportFromModule(pandas.obj(), "NaT", &ref).ok()) {
- pandas_NaT = ref.obj();
- // PyObject_Type returns a new reference but we trust that pandas.NaT will
- // outlive our use of this PyObject*
- pandas_NaTType = Py_TYPE(ref.obj());
- }
-
- // retain a reference to Timedelta
- if (ImportFromModule(pandas.obj(), "Timedelta", &ref).ok()) {
- pandas_Timedelta = ref.obj();
- }
+} // namespace
- // retain a reference to Timestamp
- if (ImportFromModule(pandas.obj(), "Timestamp", &ref).ok()) {
- pandas_Timestamp = ref.obj();
- }
+void InitPandasStaticData() {
+ pandas_runner.RunOnce([](OwnedRef& module) {
+ OwnedRef ref;
+
+ // set NaT sentinel and its type
+ if (ImportFromModule(module.obj(), "NaT", &ref).ok()) {
+ pandas_NaT = ref.obj();
+ // PyObject_Type returns a new reference but we trust that pandas.NaT
will
+ // outlive our use of this PyObject*
+ pandas_NaTType = Py_TYPE(ref.obj());
+ }
- // if pandas.NA exists, retain a reference to it
- if (ImportFromModule(pandas.obj(), "NA", &ref).ok()) {
- pandas_NA = ref.obj();
- }
+ // retain a reference to Timedelta
+ if (ImportFromModule(module.obj(), "Timedelta", &ref).ok()) {
+ pandas_Timedelta = ref.obj();
+ }
- // Import DateOffset type
- if (ImportFromModule(pandas.obj(), "DateOffset", &ref).ok()) {
- pandas_DateOffset = ref.obj();
- }
-}
+ // retain a reference to Timestamp
+ if (ImportFromModule(module.obj(), "Timestamp", &ref).ok()) {
+ pandas_Timestamp = ref.obj();
+ }
-} // namespace
+ // if pandas.NA exists, retain a reference to it
+ if (ImportFromModule(module.obj(), "NA", &ref).ok()) {
+ pandas_NA = ref.obj();
+ }
-#ifdef Py_GIL_DISABLED
-void InitPandasStaticData() {
- std::call_once(pandas_static_initialized, GetPandasStaticSymbols);
-}
-#else
-void InitPandasStaticData() {
- // NOTE: This is called with the GIL held. We needn't (and shouldn't,
- // to avoid deadlocks) use an additional C++ lock (ARROW-10519).
- if (pandas_static_initialized) {
- return;
- }
- GetPandasStaticSymbols();
- pandas_static_initialized = true;
+ // Import DateOffset type
+ if (ImportFromModule(module.obj(), "DateOffset", &ref).ok()) {
+ pandas_DateOffset = ref.obj();
+ }
+ });
}
-#endif
bool PandasObjectIsNull(PyObject* obj) {
if (!MayHaveNaN(obj)) {
diff --git a/python/pyarrow/src/arrow/python/helpers.h
b/python/pyarrow/src/arrow/python/helpers.h
index b0cf101028..b4417a9644 100644
--- a/python/pyarrow/src/arrow/python/helpers.h
+++ b/python/pyarrow/src/arrow/python/helpers.h
@@ -92,6 +92,10 @@ PyObject* BorrowPandasDataOffsetType();
ARROW_PYTHON_EXPORT
bool PyFloat_IsNaN(PyObject* obj);
+// \brief Check whether obj is a uuid.UUID instance
+ARROW_PYTHON_EXPORT
+bool IsPyUuid(PyObject* obj);
+
inline bool IsPyBinary(PyObject* obj) {
return PyBytes_Check(obj) || PyByteArray_Check(obj) ||
PyMemoryView_Check(obj);
}
diff --git a/python/pyarrow/src/arrow/python/inference.cc
b/python/pyarrow/src/arrow/python/inference.cc
index 06cb469483..291cc42149 100644
--- a/python/pyarrow/src/arrow/python/inference.cc
+++ b/python/pyarrow/src/arrow/python/inference.cc
@@ -27,6 +27,7 @@
#include <utility>
#include <vector>
+#include "arrow/extension/uuid.h"
#include "arrow/scalar.h"
#include "arrow/status.h"
#include "arrow/util/decimal.h"
@@ -407,6 +408,7 @@ class TypeInferrer {
arrow_scalar_count_(0),
numpy_dtype_count_(0),
interval_count_(0),
+ uuid_count_(0),
max_decimal_metadata_(std::numeric_limits<int32_t>::min(),
std::numeric_limits<int32_t>::min()),
decimal_type_() {
@@ -475,6 +477,9 @@ class TypeInferrer {
++decimal_count_;
} else if (PyObject_IsInstance(obj, interval_types_.obj())) {
++interval_count_;
+ } else if (internal::IsPyUuid(obj)) {
+ ++uuid_count_;
+ *keep_going = make_unions_;
} else {
return internal::InvalidValue(obj,
"did not recognize Python value type when
inferring "
@@ -604,6 +609,8 @@ class TypeInferrer {
*out = utf8();
} else if (interval_count_) {
*out = month_day_nano_interval();
+ } else if (uuid_count_) {
+ *out = extension::uuid();
} else if (arrow_scalar_count_) {
*out = scalar_type_;
} else {
@@ -766,6 +773,7 @@ class TypeInferrer {
int64_t arrow_scalar_count_;
int64_t numpy_dtype_count_;
int64_t interval_count_;
+ int64_t uuid_count_;
std::unique_ptr<TypeInferrer> list_inferrer_;
std::vector<std::pair<std::string, TypeInferrer>> struct_inferrers_;
std::unordered_map<std::string, size_t> struct_field_index_;
diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc
b/python/pyarrow/src/arrow/python/python_to_arrow.cc
index 139eb1d7f4..c70510a480 100644
--- a/python/pyarrow/src/arrow/python/python_to_arrow.cc
+++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -36,6 +36,7 @@
#include "arrow/array/builder_primitive.h"
#include "arrow/array/builder_time.h"
#include "arrow/chunked_array.h"
+#include "arrow/extension_type.h"
#include "arrow/result.h"
#include "arrow/scalar.h"
#include "arrow/status.h"
@@ -512,7 +513,12 @@ class PyValue {
static Status Convert(const FixedSizeBinaryType* type, const O&, I obj,
PyBytesView& view) {
- ARROW_RETURN_NOT_OK(view.ParseString(obj));
+ // Check if obj is a uuid.UUID instance
+ if (internal::IsPyUuid(obj)) {
+ ARROW_RETURN_NOT_OK(view.ParseUuid(obj));
+ } else {
+ ARROW_RETURN_NOT_OK(view.ParseString(obj));
+ }
if (view.size != type->byte_width()) {
std::stringstream ss;
ss << "expected to be length " << type->byte_width() << " was " <<
view.size;
@@ -1268,9 +1274,16 @@ Result<std::shared_ptr<ChunkedArray>>
ConvertPySequence(PyObject* obj, PyObject*
// In some cases, type inference may be "loose", like strings. If the user
// passed pa.string(), then we will error if we encounter any non-UTF8
// value. If not, then we will allow the result to be a BinaryArray
+ std::shared_ptr<DataType> extension_type;
if (options.type == nullptr) {
ARROW_ASSIGN_OR_RAISE(options.type, InferArrowType(seq, mask,
options.from_pandas));
options.strict = false;
+ // If type inference returned an extension type, convert using
+ // the storage type and then wrap the result as an extension array
+ if (options.type->id() == Type::EXTENSION) {
+ extension_type = options.type;
+ options.type = checked_cast<const
ExtensionType&>(*options.type).storage_type();
+ }
} else {
options.strict = true;
}
@@ -1278,6 +1291,7 @@ Result<std::shared_ptr<ChunkedArray>>
ConvertPySequence(PyObject* obj, PyObject*
ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter<PyConverter,
PyConverterTrait>(
options.type, options, pool)));
+ std::shared_ptr<ChunkedArray> result;
if (converter->may_overflow()) {
// The converter hierarchy contains binary- or list-like builders which
can overflow
// depending on the input values. Wrap the converter with a chunker which
detects
@@ -1288,7 +1302,7 @@ Result<std::shared_ptr<ChunkedArray>>
ConvertPySequence(PyObject* obj, PyObject*
} else {
RETURN_NOT_OK(chunked_converter->Extend(seq, size));
}
- return chunked_converter->ToChunkedArray();
+ ARROW_ASSIGN_OR_RAISE(result, chunked_converter->ToChunkedArray());
} else {
// If the converter can't overflow spare the capacity error checking on
the hot-path,
// this improves the performance roughly by ~10% for primitive types.
@@ -1297,8 +1311,13 @@ Result<std::shared_ptr<ChunkedArray>>
ConvertPySequence(PyObject* obj, PyObject*
} else {
RETURN_NOT_OK(converter->Extend(seq, size));
}
- return converter->ToChunkedArray();
+ ARROW_ASSIGN_OR_RAISE(result, converter->ToChunkedArray());
+ }
+ // If we inferred an extension type, wrap as an extension array
+ if (extension_type != nullptr) {
+ return ExtensionType::WrapArray(extension_type, result);
}
+ return result;
}
} // namespace py
diff --git a/python/pyarrow/tests/test_extension_type.py
b/python/pyarrow/tests/test_extension_type.py
index c947b06e0e..66fcfc0556 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -1399,6 +1399,93 @@ def test_uuid_extension():
assert isinstance(array[0], pa.UuidScalar)
+def test_uuid_scalar_from_python():
+ # Test with explicit type
+ py_uuid = uuid4()
+ scalar = pa.scalar(py_uuid, type=pa.uuid())
+ assert isinstance(scalar, pa.UuidScalar)
+ assert scalar.type == pa.uuid()
+ assert scalar.as_py() == py_uuid
+
+ # Test with specific UUID value
+ specific_uuid = UUID("12345678-1234-5678-1234-567812345678")
+ scalar = pa.scalar(specific_uuid, type=pa.uuid())
+ assert scalar.as_py() == specific_uuid
+ assert scalar.value.as_py() == specific_uuid.bytes
+
+ scalar = pa.scalar(None, type=pa.uuid())
+ assert scalar.is_valid is False
+ assert scalar.as_py() is None
+
+ # Test type inference from uuid.UUID
+ py_uuid = uuid4()
+ scalar = pa.scalar(py_uuid)
+ assert isinstance(scalar, pa.UuidScalar)
+ assert scalar.type == pa.uuid()
+ assert scalar.as_py() == py_uuid
+
+
+def test_uuid_array_from_python():
+ # Test array with explicit type
+ uuids = [uuid4() for _ in range(3)]
+ uuids.append(None)
+
+ arr = pa.array(uuids, type=pa.uuid())
+ assert arr.type == pa.uuid()
+ assert len(arr) == 4
+ assert arr.null_count == 1
+ for i, u in enumerate(uuids):
+ assert arr[i].as_py() == u
+
+ # Test type inference for arrays
+ arr = pa.array(uuids)
+ assert arr.type == pa.uuid()
+ for i, u in enumerate(uuids):
+ assert arr[i].as_py() == u
+
+
[email protected]("bytes_value,exc_type,match", [
+ (b"0123456789abcde", pa.ArrowInvalid, "expected to be length 16 was 15"),
+ (
+ "0123456789abcdef", TypeError,
+ "Expected uuid.UUID.bytes to return bytes, got 'str'"
+ ),
+ (None, TypeError, "Expected uuid.UUID.bytes to return bytes, got
'NoneType'"),
+])
+def test_uuid_bytes_property_not_bytes(bytes_value, exc_type, match):
+ class BadUuid(UUID):
+ @property
+ def bytes(self):
+ return bytes_value
+
+ bad = BadUuid(uuid4().hex)
+ with pytest.raises(exc_type, match=match):
+ pa.array([bad], type=pa.uuid())
+ with pytest.raises(exc_type, match=match):
+ pa.scalar(bad, type=pa.uuid())
+ with pytest.raises(exc_type, match=match):
+ pa.array([bad])
+ with pytest.raises(exc_type, match=match):
+ pa.scalar(bad)
+
+
+def test_uuid_bytes_property_raises():
+ class BadUuid(UUID):
+ @property
+ def bytes(self):
+ raise RuntimeError("broken")
+
+ bad = BadUuid(uuid4().hex)
+ with pytest.raises(RuntimeError, match="broken"):
+ pa.array([bad], type=pa.uuid())
+ with pytest.raises(RuntimeError, match="broken"):
+ pa.scalar(bad, type=pa.uuid())
+ with pytest.raises(RuntimeError, match="broken"):
+ pa.array([bad])
+ with pytest.raises(RuntimeError, match="broken"):
+ pa.scalar(bad)
+
+
def test_tensor_type():
tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 3])
assert tensor_type.extension_name == "arrow.fixed_shape_tensor"