This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new e559dd0 ARROW-9440: [Python] Expose Fill Null kernel
e559dd0 is described below
commit e559dd080a27875bab3d5cdb0da115c62e2f60bb
Author: c-jamie <[email protected]>
AuthorDate: Mon Jul 13 19:53:47 2020 -0500
ARROW-9440: [Python] Expose Fill Null kernel
Closes #7736 from c-jamie/ARROW-9440
Lead-authored-by: c-jamie <[email protected]>
Co-authored-by: Wes McKinney <[email protected]>
Signed-off-by: Wes McKinney <[email protected]>
---
python/pyarrow/array.pxi | 6 ++++
python/pyarrow/compute.py | 41 +++++++++++++++++++++++
python/pyarrow/includes/libarrow.pxd | 1 +
python/pyarrow/scalar.pxi | 13 ++++++++
python/pyarrow/table.pxi | 6 ++++
python/pyarrow/tests/test_compute.py | 63 ++++++++++++++++++++++++++++++++++++
python/pyarrow/tests/test_scalars.py | 9 ++++++
7 files changed, 139 insertions(+)
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index 1cffd37..1dcff02 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -1004,6 +1004,12 @@ cdef class Array(_PandasConvertible):
"""
return _pc().is_valid(self)
+ def fill_null(self, fill_value):
+ """
+ See pyarrow.compute.fill_null for usage.
+ """
+ return _pc().fill_null(self, fill_value)
+
def __getitem__(self, key):
"""
Slice or return value at given index
diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py
index c8443ed..b8e678f 100644
--- a/python/pyarrow/compute.py
+++ b/python/pyarrow/compute.py
@@ -24,6 +24,7 @@ from pyarrow._compute import ( # noqa
call_function,
TakeOptions
)
+import pyarrow as pa
import pyarrow._compute as _pc
@@ -259,3 +260,43 @@ def take(data, indices, boundscheck=True):
"""
options = TakeOptions(boundscheck)
return call_function('take', [data, indices], options)
+
+
+def fill_null(values, fill_value):
+ """
+ Replace each null element in values with fill_value. The fill_value must be
+ the same type as values or able to be implicitly cast to the array's
+ type.
+
+ Parameters
+ ----------
+ values : Array, ChunkedArray
+ replace each null element with fill_value
+ fill_value : Scalar-like object
+ Either a pyarrow.Scalar or any python object coercible to a
+ Scalar. If not the same type as values, will attempt to cast.
+
+ Returns
+ -------
+ result : depends on inputs
+
+ Examples
+ --------
+ >>> import pyarrow as pa
+ >>> arr = pa.array([1, 2, None, 3], type=pa.int8())
+ >>> fill_value = pa.scalar(5, type=pa.int8())
+ >>> arr.fill_null(fill_value)
+ <pyarrow.lib.Int8Array object at 0x7f95437f01a0>
+ [
+ 1,
+ 2,
+ 5,
+ 3
+ ]
+ """
+ if not isinstance(fill_value, pa.Scalar):
+ fill_value = pa.scalar(fill_value, type=values.type)
+ elif values.type != fill_value.type:
+ fill_value = pa.scalar(fill_value.as_py(), type=values.type)
+
+ return call_function("fill_null", [values, fill_value])
diff --git a/python/pyarrow/includes/libarrow.pxd
b/python/pyarrow/includes/libarrow.pxd
index 213ef24..c8e7c5b 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -887,6 +887,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
c_bool is_valid
c_string ToString() const
c_bool Equals(const CScalar& other) const
+ CResult[shared_ptr[CScalar]] CastTo(shared_ptr[CDataType] to) const
cdef cppclass CScalarHash" arrow::Scalar::Hash":
size_t operator()(const shared_ptr[CScalar]& scalar) const
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 903faae..248d926 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -63,6 +63,19 @@ cdef class Scalar:
"""
return self.wrapped.get().is_valid
+ def cast(self, object target_type):
+ """
+ Attempt a safe cast to target data type.
+ """
+ cdef:
+ DataType type = ensure_type(target_type)
+ shared_ptr[CScalar] result
+
+ with nogil:
+ result = GetResultValue(self.wrapped.get().CastTo(type.sp_type))
+
+ return Scalar.wrap(result)
+
def __repr__(self):
return '<pyarrow.{}: {!r}>'.format(
self.__class__.__name__, self.as_py()
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 08e3f75..688d668 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -191,6 +191,12 @@ cdef class ChunkedArray(_PandasConvertible):
except TypeError:
return NotImplemented
+ def fill_null(self, fill_value):
+ """
+ See pyarrow.compute.fill_null docstring for usage.
+ """
+ return _pc().fill_null(self, fill_value)
+
def equals(self, ChunkedArray other):
"""
Return whether the contents of two chunked arrays are equal.
diff --git a/python/pyarrow/tests/test_compute.py
b/python/pyarrow/tests/test_compute.py
index ca30a82..59f004f 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -484,3 +484,66 @@ def test_is_null():
result = arr.is_valid()
expected = pa.chunked_array([[True, True], [True, False]])
assert result.equals(expected)
+
+
+def test_fill_null():
+ arr = pa.array([1, 2, None, 4], type=pa.int8())
+ fill_value = pa.array([5], type=pa.int8())
+ with pytest.raises(TypeError):
+ arr.fill_null(fill_value)
+
+ arr = pa.array([None, None, None, None], type=pa.null())
+ fill_value = pa.scalar(None, type=pa.null())
+ result = arr.fill_null(fill_value)
+ expected = pa.array([None, None, None, None])
+ assert result.equals(expected)
+
+
[email protected]('arrow_type', numerical_arrow_types)
+def test_fill_null_array(arrow_type):
+ arr = pa.array([1, 2, None, 4], type=arrow_type)
+ fill_value = pa.scalar(5, type=arrow_type)
+ result = arr.fill_null(fill_value)
+ expected = pa.array([1, 2, 5, 4], type=arrow_type)
+ assert result.equals(expected)
+
+ # Implicit conversions
+ result = arr.fill_null(5)
+ assert result.equals(expected)
+
+ # ARROW-9451: Unsigned integers allow this for some reason
+ if not pa.types.is_unsigned_integer(arr.type):
+ with pytest.raises((ValueError, TypeError)):
+ arr.fill_null('5')
+
+ result = arr.fill_null(pa.scalar(5, type='int8'))
+ assert result.equals(expected)
+
+
[email protected]('arrow_type', numerical_arrow_types)
+def test_fill_null_chunked_array(arrow_type):
+ fill_value = pa.scalar(5, type=arrow_type)
+ arr = pa.chunked_array([pa.array([None, 2, 3, 4], type=arrow_type)])
+ result = arr.fill_null(fill_value)
+ expected = pa.chunked_array([pa.array([5, 2, 3, 4], type=arrow_type)])
+ assert result.equals(expected)
+
+ arr = pa.chunked_array([
+ pa.array([1, 2], type=arrow_type),
+ pa.array([], type=arrow_type),
+ pa.array([None, 4], type=arrow_type)
+ ])
+ expected = pa.chunked_array([
+ pa.array([1, 2], type=arrow_type),
+ pa.array([], type=arrow_type),
+ pa.array([5, 4], type=arrow_type)
+ ])
+ result = arr.fill_null(fill_value)
+ assert result.equals(expected)
+
+ # Implicit conversions
+ result = arr.fill_null(5)
+ assert result.equals(expected)
+
+ result = arr.fill_null(pa.scalar(5, type='int8'))
+ assert result.equals(expected)
diff --git a/python/pyarrow/tests/test_scalars.py
b/python/pyarrow/tests/test_scalars.py
index 81b2c3f..8a778bf 100644
--- a/python/pyarrow/tests/test_scalars.py
+++ b/python/pyarrow/tests/test_scalars.py
@@ -181,6 +181,15 @@ def test_time():
assert s.as_py() == t
+def test_cast():
+ val = pa.scalar(5, type='int8')
+ assert val.cast('int64') == pa.scalar(5, type='int64')
+ assert val.cast('uint32') == pa.scalar(5, type='uint32')
+ assert val.cast('string') == pa.scalar('5', type='string')
+ with pytest.raises(ValueError):
+ pa.scalar('foo').cast('int32')
+
+
@pytest.mark.pandas
def test_timestamp():
import pandas as pd