This is an automated email from the ASF dual-hosted git repository.

rok pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new c187333336 GH-48241: [Python] Scalar inferencing doesn't infer UUID 
(#48727)
c187333336 is described below

commit c187333336552cf2872a0e8fd240baabf547407c
Author: tadeja <[email protected]>
AuthorDate: Tue Mar 3 20:39:14 2026 +0100

    GH-48241: [Python] Scalar inferencing doesn't infer UUID (#48727)
    
    ### Rationale for this change
    This closes #48241, #44224 and #43855.
    Currently uuid.UUID objects are not inferred/converted automatically in 
PyArrow, requiring users to explicitly specify the type.
    
    ### What changes are included in this PR?
    Adding support for Python's uuid.UUID objects in PyArrow's type inference 
and conversion.
    
    ### Are these changes tested?
    Yes, added test_uuid_scalar_from_python() and test_uuid_array_from_python() 
in `test_extension_type.py`.
    
    ### Are there any user-facing changes?
    Users can now pass Python uuid.UUID objects directly to PyArrow functions 
like pa.scalar() and pa.array() without specifying the type:
    ```python
    import uuid
    import pyarrow as pa
    
    pa.scalar(uuid.uuid4())
    ```
    <pyarrow.UuidScalar: UUID('958174b9-3a5c-4cdd-8fc5-d51a2fc55784')>
    ```python
    pa.array([uuid.uuid4()])
    ```
     <pyarrow.lib.UuidArray object at 0x1217725f0>
    [
      73611FD81F764A209C8B9CDBADDA1F53
    ]
    * GitHub Issue: #48241
    
    Lead-authored-by: Tadeja Kadunc <[email protected]>
    Co-authored-by: tadeja <[email protected]>
    Co-authored-by: Rok Mihevc <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Rok Mihevc <[email protected]>
---
 docs/source/python/extending_types.rst             |  44 +++++-
 python/pyarrow/src/arrow/python/common.h           |  14 ++
 python/pyarrow/src/arrow/python/helpers.cc         | 154 ++++++++++++---------
 python/pyarrow/src/arrow/python/helpers.h          |   4 +
 python/pyarrow/src/arrow/python/inference.cc       |   8 ++
 python/pyarrow/src/arrow/python/python_to_arrow.cc |  25 +++-
 python/pyarrow/tests/test_extension_type.py        |  87 ++++++++++++
 7 files changed, 265 insertions(+), 71 deletions(-)

diff --git a/docs/source/python/extending_types.rst 
b/docs/source/python/extending_types.rst
index 48262b6807..fec04c182a 100644
--- a/docs/source/python/extending_types.rst
+++ b/docs/source/python/extending_types.rst
@@ -476,8 +476,8 @@ You can find the official list of canonical extension types 
in the
 :ref:`format_canonical_extensions` section. Here we add examples on how to
 use them in PyArrow.
 
-Fixed size tensor
-"""""""""""""""""
+Fixed shape tensor
+""""""""""""""""""
 
 To create an array of tensors with equal shape (fixed shape tensor array) we
 first need to define a fixed shape tensor extension type with value type
@@ -487,7 +487,7 @@ and shape:
 
    >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), (2, 2))
 
-Then we need the storage array with :func:`pyarrow.list_` type where 
``value_type```
+Then we need the storage array with :func:`pyarrow.list_` type where 
``value_type``
 is the fixed shape tensor value type and list size is a product of 
``tensor_type``
 shape elements. Then we can create an array of tensors with
 ``pa.ExtensionArray.from_storage()`` method:
@@ -629,3 +629,41 @@ for ``NCHW`` format where:
 * C: number of channels of the image
 * H: height of the image
 * W: width of the image
+
+UUID
+""""
+
+The UUID extension type (``arrow.uuid``) represents universally unique
+identifiers as 16-byte fixed-size binary values. PyArrow provides integration
+with Python's built-in :mod:`uuid` module, including automatic type inference.
+
+Creating UUID scalars and arrays
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+PyArrow infers the UUID type from Python's ``uuid.UUID`` objects,
+so you can pass them directly to :func:`pyarrow.scalar` and 
:func:`pyarrow.array`:
+
+.. code-block:: python
+
+   >>> import uuid
+   >>> import pyarrow as pa
+
+   >>> pa.scalar(uuid.uuid4())
+   <pyarrow.UuidScalar: UUID('...')>
+
+   >>> uuids = [uuid.uuid4() for _ in range(3)]
+   >>> arr = pa.array(uuids)
+   >>> arr.type
+   UuidType(extension<arrow.uuid>)
+
+You can also explicitly specify the UUID type using :func:`pyarrow.uuid`:
+
+.. code-block:: python
+
+   >>> pa.array([uuid.uuid4(), uuid.uuid4()], type=pa.uuid())
+   <pyarrow.lib.UuidArray object at ...>
+   [
+     ...,
+     ...
+   ]
+
diff --git a/python/pyarrow/src/arrow/python/common.h 
b/python/pyarrow/src/arrow/python/common.h
index affefe2859..a81782330b 100644
--- a/python/pyarrow/src/arrow/python/common.h
+++ b/python/pyarrow/src/arrow/python/common.h
@@ -419,6 +419,20 @@ struct PyBytesView {
     return Status::OK();
   }
 
+  // Parse bytes from a uuid.UUID object (stores reference to keep bytes alive)
+  Status ParseUuid(PyObject* obj) {
+    ref.reset(PyObject_GetAttrString(obj, "bytes"));
+    RETURN_IF_PYERROR();
+    if (!PyBytes_Check(ref.obj())) {
+      return Status::TypeError("Expected uuid.UUID.bytes to return bytes, got 
'",
+                               Py_TYPE(ref.obj())->tp_name, "' object");
+    }
+    bytes = PyBytes_AS_STRING(ref.obj());
+    size = PyBytes_GET_SIZE(ref.obj());
+    is_utf8 = false;
+    return Status::OK();
+  }
+
  protected:
   OwnedRef ref;
 };
diff --git a/python/pyarrow/src/arrow/python/helpers.cc 
b/python/pyarrow/src/arrow/python/helpers.cc
index 0a24b25931..3515455d22 100644
--- a/python/pyarrow/src/arrow/python/helpers.cc
+++ b/python/pyarrow/src/arrow/python/helpers.cc
@@ -296,16 +296,69 @@ bool PyFloat_IsNaN(PyObject* obj) {
 
 namespace {
 
-// This needs a conditional, because using std::once_flag could introduce
-// a deadlock when the GIL is enabled. See
-// 
https://github.com/apache/arrow/commit/f69061935e92e36e25bb891177ca8bc4f463b272 
for
-// more info.
+// Thread-safe one-time Python module import + attribute lookup. For Pandas 
and UUID.
+// Uses std::call_once when the GIL is disabled, or a simple boolean flag when
+// the GIL is enabled to avoid deadlocks. See ARROW-10519 for more details and
+// 
https://github.com/apache/arrow/commit/f69061935e92e36e25bb891177ca8bc4f463b272
+struct ModuleOnceRunner {
+  std::string module_name;
 #ifdef Py_GIL_DISABLED
-static std::once_flag pandas_static_initialized;
+  std::once_flag initialized;
 #else
-static bool pandas_static_initialized = false;
+  bool initialized = false;
 #endif
 
+  explicit ModuleOnceRunner(const std::string& module_name) : 
module_name(module_name) {}
+
+  template <typename Func>
+  void RunOnce(Func&& func) {
+    auto do_init = [&]() {
+      OwnedRef module;
+      if (ImportModule(module_name, &module).ok()) {
+#ifndef Py_GIL_DISABLED
+        // Since ImportModule can release the GIL, another thread could have
+        // already initialized the static data.
+        if (initialized) {
+          return;
+        }
+#endif
+        func(module);
+      }
+    };
+#ifdef Py_GIL_DISABLED
+    std::call_once(initialized, do_init);
+#else
+    if (!initialized) {
+      do_init();
+      initialized = true;
+    }
+#endif
+  }
+};
+
+static PyObject* uuid_UUID = nullptr;
+static ModuleOnceRunner uuid_runner("uuid");
+
+}  // namespace
+
+bool IsPyUuid(PyObject* obj) {
+  uuid_runner.RunOnce([](OwnedRef& module) {
+    OwnedRef ref;
+    if (ImportFromModule(module.obj(), "UUID", &ref).ok()) {
+      uuid_UUID = ref.obj();
+    }
+  });
+  if (!uuid_UUID) return false;
+  int result = PyObject_IsInstance(obj, uuid_UUID);
+  if (result < 0) {
+    PyErr_Clear();
+    return false;
+  }
+  return result != 0;
+}
+
+namespace {
+
 // Once initialized, these variables hold borrowed references to Pandas static 
data.
 // We should not use OwnedRef here because Python destructors would be
 // called on a finalized interpreter.
@@ -315,72 +368,43 @@ static PyObject* pandas_Timedelta = nullptr;
 static PyObject* pandas_Timestamp = nullptr;
 static PyTypeObject* pandas_NaTType = nullptr;
 static PyObject* pandas_DateOffset = nullptr;
+static ModuleOnceRunner pandas_runner("pandas");
 
-void GetPandasStaticSymbols() {
-  OwnedRef pandas;
-
-  // Import pandas
-  Status s = ImportModule("pandas", &pandas);
-  if (!s.ok()) {
-    return;
-  }
-
-#ifndef Py_GIL_DISABLED
-  // Since ImportModule can release the GIL, another thread could have
-  // already initialized the static data.
-  if (pandas_static_initialized) {
-    return;
-  }
-#endif
-
-  OwnedRef ref;
-
-  // set NaT sentinel and its type
-  if (ImportFromModule(pandas.obj(), "NaT", &ref).ok()) {
-    pandas_NaT = ref.obj();
-    // PyObject_Type returns a new reference but we trust that pandas.NaT will
-    // outlive our use of this PyObject*
-    pandas_NaTType = Py_TYPE(ref.obj());
-  }
-
-  // retain a reference to Timedelta
-  if (ImportFromModule(pandas.obj(), "Timedelta", &ref).ok()) {
-    pandas_Timedelta = ref.obj();
-  }
+}  // namespace
 
-  // retain a reference to Timestamp
-  if (ImportFromModule(pandas.obj(), "Timestamp", &ref).ok()) {
-    pandas_Timestamp = ref.obj();
-  }
+void InitPandasStaticData() {
+  pandas_runner.RunOnce([](OwnedRef& module) {
+    OwnedRef ref;
+
+    // set NaT sentinel and its type
+    if (ImportFromModule(module.obj(), "NaT", &ref).ok()) {
+      pandas_NaT = ref.obj();
+      // PyObject_Type returns a new reference but we trust that pandas.NaT 
will
+      // outlive our use of this PyObject*
+      pandas_NaTType = Py_TYPE(ref.obj());
+    }
 
-  // if pandas.NA exists, retain a reference to it
-  if (ImportFromModule(pandas.obj(), "NA", &ref).ok()) {
-    pandas_NA = ref.obj();
-  }
+    // retain a reference to Timedelta
+    if (ImportFromModule(module.obj(), "Timedelta", &ref).ok()) {
+      pandas_Timedelta = ref.obj();
+    }
 
-  // Import DateOffset type
-  if (ImportFromModule(pandas.obj(), "DateOffset", &ref).ok()) {
-    pandas_DateOffset = ref.obj();
-  }
-}
+    // retain a reference to Timestamp
+    if (ImportFromModule(module.obj(), "Timestamp", &ref).ok()) {
+      pandas_Timestamp = ref.obj();
+    }
 
-}  // namespace
+    // if pandas.NA exists, retain a reference to it
+    if (ImportFromModule(module.obj(), "NA", &ref).ok()) {
+      pandas_NA = ref.obj();
+    }
 
-#ifdef Py_GIL_DISABLED
-void InitPandasStaticData() {
-  std::call_once(pandas_static_initialized, GetPandasStaticSymbols);
-}
-#else
-void InitPandasStaticData() {
-  // NOTE: This is called with the GIL held.  We needn't (and shouldn't,
-  // to avoid deadlocks) use an additional C++ lock (ARROW-10519).
-  if (pandas_static_initialized) {
-    return;
-  }
-  GetPandasStaticSymbols();
-  pandas_static_initialized = true;
+    // Import DateOffset type
+    if (ImportFromModule(module.obj(), "DateOffset", &ref).ok()) {
+      pandas_DateOffset = ref.obj();
+    }
+  });
 }
-#endif
 
 bool PandasObjectIsNull(PyObject* obj) {
   if (!MayHaveNaN(obj)) {
diff --git a/python/pyarrow/src/arrow/python/helpers.h 
b/python/pyarrow/src/arrow/python/helpers.h
index b0cf101028..b4417a9644 100644
--- a/python/pyarrow/src/arrow/python/helpers.h
+++ b/python/pyarrow/src/arrow/python/helpers.h
@@ -92,6 +92,10 @@ PyObject* BorrowPandasDataOffsetType();
 ARROW_PYTHON_EXPORT
 bool PyFloat_IsNaN(PyObject* obj);
 
+// \brief Check whether obj is a uuid.UUID instance
+ARROW_PYTHON_EXPORT
+bool IsPyUuid(PyObject* obj);
+
 inline bool IsPyBinary(PyObject* obj) {
   return PyBytes_Check(obj) || PyByteArray_Check(obj) || 
PyMemoryView_Check(obj);
 }
diff --git a/python/pyarrow/src/arrow/python/inference.cc 
b/python/pyarrow/src/arrow/python/inference.cc
index 06cb469483..291cc42149 100644
--- a/python/pyarrow/src/arrow/python/inference.cc
+++ b/python/pyarrow/src/arrow/python/inference.cc
@@ -27,6 +27,7 @@
 #include <utility>
 #include <vector>
 
+#include "arrow/extension/uuid.h"
 #include "arrow/scalar.h"
 #include "arrow/status.h"
 #include "arrow/util/decimal.h"
@@ -407,6 +408,7 @@ class TypeInferrer {
         arrow_scalar_count_(0),
         numpy_dtype_count_(0),
         interval_count_(0),
+        uuid_count_(0),
         max_decimal_metadata_(std::numeric_limits<int32_t>::min(),
                               std::numeric_limits<int32_t>::min()),
         decimal_type_() {
@@ -475,6 +477,9 @@ class TypeInferrer {
       ++decimal_count_;
     } else if (PyObject_IsInstance(obj, interval_types_.obj())) {
       ++interval_count_;
+    } else if (internal::IsPyUuid(obj)) {
+      ++uuid_count_;
+      *keep_going = make_unions_;
     } else {
       return internal::InvalidValue(obj,
                                     "did not recognize Python value type when 
inferring "
@@ -604,6 +609,8 @@ class TypeInferrer {
       *out = utf8();
     } else if (interval_count_) {
       *out = month_day_nano_interval();
+    } else if (uuid_count_) {
+      *out = extension::uuid();
     } else if (arrow_scalar_count_) {
       *out = scalar_type_;
     } else {
@@ -766,6 +773,7 @@ class TypeInferrer {
   int64_t arrow_scalar_count_;
   int64_t numpy_dtype_count_;
   int64_t interval_count_;
+  int64_t uuid_count_;
   std::unique_ptr<TypeInferrer> list_inferrer_;
   std::vector<std::pair<std::string, TypeInferrer>> struct_inferrers_;
   std::unordered_map<std::string, size_t> struct_field_index_;
diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc 
b/python/pyarrow/src/arrow/python/python_to_arrow.cc
index 139eb1d7f4..c70510a480 100644
--- a/python/pyarrow/src/arrow/python/python_to_arrow.cc
+++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -36,6 +36,7 @@
 #include "arrow/array/builder_primitive.h"
 #include "arrow/array/builder_time.h"
 #include "arrow/chunked_array.h"
+#include "arrow/extension_type.h"
 #include "arrow/result.h"
 #include "arrow/scalar.h"
 #include "arrow/status.h"
@@ -512,7 +513,12 @@ class PyValue {
 
   static Status Convert(const FixedSizeBinaryType* type, const O&, I obj,
                         PyBytesView& view) {
-    ARROW_RETURN_NOT_OK(view.ParseString(obj));
+    // Check if obj is a uuid.UUID instance
+    if (internal::IsPyUuid(obj)) {
+      ARROW_RETURN_NOT_OK(view.ParseUuid(obj));
+    } else {
+      ARROW_RETURN_NOT_OK(view.ParseString(obj));
+    }
     if (view.size != type->byte_width()) {
       std::stringstream ss;
       ss << "expected to be length " << type->byte_width() << " was " << 
view.size;
@@ -1268,9 +1274,16 @@ Result<std::shared_ptr<ChunkedArray>> 
ConvertPySequence(PyObject* obj, PyObject*
   // In some cases, type inference may be "loose", like strings. If the user
   // passed pa.string(), then we will error if we encounter any non-UTF8
   // value. If not, then we will allow the result to be a BinaryArray
+  std::shared_ptr<DataType> extension_type;
   if (options.type == nullptr) {
     ARROW_ASSIGN_OR_RAISE(options.type, InferArrowType(seq, mask, 
options.from_pandas));
     options.strict = false;
+    // If type inference returned an extension type, convert using
+    // the storage type and then wrap the result as an extension array
+    if (options.type->id() == Type::EXTENSION) {
+      extension_type = options.type;
+      options.type = checked_cast<const 
ExtensionType&>(*options.type).storage_type();
+    }
   } else {
     options.strict = true;
   }
@@ -1278,6 +1291,7 @@ Result<std::shared_ptr<ChunkedArray>> 
ConvertPySequence(PyObject* obj, PyObject*
 
   ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter<PyConverter, 
PyConverterTrait>(
                                             options.type, options, pool)));
+  std::shared_ptr<ChunkedArray> result;
   if (converter->may_overflow()) {
     // The converter hierarchy contains binary- or list-like builders which 
can overflow
     // depending on the input values. Wrap the converter with a chunker which 
detects
@@ -1288,7 +1302,7 @@ Result<std::shared_ptr<ChunkedArray>> 
ConvertPySequence(PyObject* obj, PyObject*
     } else {
       RETURN_NOT_OK(chunked_converter->Extend(seq, size));
     }
-    return chunked_converter->ToChunkedArray();
+    ARROW_ASSIGN_OR_RAISE(result, chunked_converter->ToChunkedArray());
   } else {
     // If the converter can't overflow spare the capacity error checking on 
the hot-path,
     // this improves the performance roughly by ~10% for primitive types.
@@ -1297,8 +1311,13 @@ Result<std::shared_ptr<ChunkedArray>> 
ConvertPySequence(PyObject* obj, PyObject*
     } else {
       RETURN_NOT_OK(converter->Extend(seq, size));
     }
-    return converter->ToChunkedArray();
+    ARROW_ASSIGN_OR_RAISE(result, converter->ToChunkedArray());
+  }
+  // If we inferred an extension type, wrap as an extension array
+  if (extension_type != nullptr) {
+    return ExtensionType::WrapArray(extension_type, result);
   }
+  return result;
 }
 
 }  // namespace py
diff --git a/python/pyarrow/tests/test_extension_type.py 
b/python/pyarrow/tests/test_extension_type.py
index c947b06e0e..66fcfc0556 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -1399,6 +1399,93 @@ def test_uuid_extension():
     assert isinstance(array[0], pa.UuidScalar)
 
 
+def test_uuid_scalar_from_python():
+    # Test with explicit type
+    py_uuid = uuid4()
+    scalar = pa.scalar(py_uuid, type=pa.uuid())
+    assert isinstance(scalar, pa.UuidScalar)
+    assert scalar.type == pa.uuid()
+    assert scalar.as_py() == py_uuid
+
+    # Test with specific UUID value
+    specific_uuid = UUID("12345678-1234-5678-1234-567812345678")
+    scalar = pa.scalar(specific_uuid, type=pa.uuid())
+    assert scalar.as_py() == specific_uuid
+    assert scalar.value.as_py() == specific_uuid.bytes
+
+    scalar = pa.scalar(None, type=pa.uuid())
+    assert scalar.is_valid is False
+    assert scalar.as_py() is None
+
+    # Test type inference from uuid.UUID
+    py_uuid = uuid4()
+    scalar = pa.scalar(py_uuid)
+    assert isinstance(scalar, pa.UuidScalar)
+    assert scalar.type == pa.uuid()
+    assert scalar.as_py() == py_uuid
+
+
+def test_uuid_array_from_python():
+    # Test array with explicit type
+    uuids = [uuid4() for _ in range(3)]
+    uuids.append(None)
+
+    arr = pa.array(uuids, type=pa.uuid())
+    assert arr.type == pa.uuid()
+    assert len(arr) == 4
+    assert arr.null_count == 1
+    for i, u in enumerate(uuids):
+        assert arr[i].as_py() == u
+
+    # Test type inference for arrays
+    arr = pa.array(uuids)
+    assert arr.type == pa.uuid()
+    for i, u in enumerate(uuids):
+        assert arr[i].as_py() == u
+
+
[email protected]("bytes_value,exc_type,match", [
+    (b"0123456789abcde", pa.ArrowInvalid, "expected to be length 16 was 15"),
+    (
+        "0123456789abcdef", TypeError,
+        "Expected uuid.UUID.bytes to return bytes, got 'str'"
+    ),
+    (None, TypeError, "Expected uuid.UUID.bytes to return bytes, got 
'NoneType'"),
+])
+def test_uuid_bytes_property_not_bytes(bytes_value, exc_type, match):
+    class BadUuid(UUID):
+        @property
+        def bytes(self):
+            return bytes_value
+
+    bad = BadUuid(uuid4().hex)
+    with pytest.raises(exc_type, match=match):
+        pa.array([bad], type=pa.uuid())
+    with pytest.raises(exc_type, match=match):
+        pa.scalar(bad, type=pa.uuid())
+    with pytest.raises(exc_type, match=match):
+        pa.array([bad])
+    with pytest.raises(exc_type, match=match):
+        pa.scalar(bad)
+
+
+def test_uuid_bytes_property_raises():
+    class BadUuid(UUID):
+        @property
+        def bytes(self):
+            raise RuntimeError("broken")
+
+    bad = BadUuid(uuid4().hex)
+    with pytest.raises(RuntimeError, match="broken"):
+        pa.array([bad], type=pa.uuid())
+    with pytest.raises(RuntimeError, match="broken"):
+        pa.scalar(bad, type=pa.uuid())
+    with pytest.raises(RuntimeError, match="broken"):
+        pa.array([bad])
+    with pytest.raises(RuntimeError, match="broken"):
+        pa.scalar(bad)
+
+
 def test_tensor_type():
     tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 3])
     assert tensor_type.extension_name == "arrow.fixed_shape_tensor"

Reply via email to