This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new 7465fe312 fix: Fix `space()` with negative input (#3347)
7465fe312 is described below
commit 7465fe3126ff64075dc64bd63c8b9eb7598b568f
Author: hsiang-c <[email protected]>
AuthorDate: Fri Jan 30 10:42:08 2026 -0800
fix: Fix `space()` with negative input (#3347)
* fix: handle negative lengths in string_space expression
The string_space function previously failed with overflow when given
negative input values. Now treats negative lengths as zero, returning
empty strings.
Co-Authored-By: Claude Sonnet 4.5 <[email protected]>
* Fix a typo
* Call max() once; more comments
---------
Co-authored-by: Claude Sonnet 4.5 <[email protected]>
---
native/spark-expr/src/string_funcs/string_space.rs | 32 ++++++++++++++++++++--
native/spark-expr/src/string_funcs/substring.rs | 2 +-
.../apache/comet/CometStringExpressionSuite.scala | 2 +-
3 files changed, 31 insertions(+), 5 deletions(-)
diff --git a/native/spark-expr/src/string_funcs/string_space.rs
b/native/spark-expr/src/string_funcs/string_space.rs
index 00b880730..4ab536279 100644
--- a/native/spark-expr/src/string_funcs/string_space.rs
+++ b/native/spark-expr/src/string_funcs/string_space.rs
@@ -119,9 +119,14 @@ fn generic_string_space<OffsetSize:
OffsetSizeTrait>(length: &Int32Array) -> Arr
let null_bit_buffer = length.to_data().nulls().map(|b| b.buffer().clone());
// Gets slice of length array to access it directly for performance.
+ // Negative length values are set to zero to match Spark behavior
let length_data = length.to_data();
- let lengths = length_data.buffers()[0].typed_data::<i32>();
- let total = lengths.iter().map(|l| *l as usize).sum::<usize>();
+ let lengths: Vec<_> = length_data.buffers()[0]
+ .typed_data::<i32>()
+ .iter()
+ .map(|l| (*l).max(0) as usize)
+ .collect();
+ let total = lengths.iter().sum::<usize>();
let mut values = MutableBuffer::new(total);
offsets.push(length_so_far);
@@ -130,7 +135,7 @@ fn generic_string_space<OffsetSize:
OffsetSizeTrait>(length: &Int32Array) -> Arr
values.resize(total, blank);
(0..array_len).for_each(|i| {
- let current_len = lengths[i] as usize;
+ let current_len = lengths[i];
length_so_far += OffsetSize::from_usize(current_len).unwrap();
offsets.push(length_so_far);
@@ -149,3 +154,24 @@ fn generic_string_space<OffsetSize:
OffsetSizeTrait>(length: &Int32Array) -> Arr
};
make_array(data)
}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use arrow::array::StringArray;
+ use datafusion::common::cast::as_string_array;
+
+ #[test]
+ fn test_negative_length() {
+ let input = Int32Array::from(vec![Some(-1), Some(-2), None]);
+ let args = ColumnarValue::Array(Arc::new(input));
+ match spark_string_space(&[args]) {
+ Ok(ColumnarValue::Array(result)) => {
+ let actual = as_string_array(&result).unwrap();
+ let expected = StringArray::from(vec![Some(""), Some(""),
None]);
+ assert_eq!(actual, &expected)
+ }
+ _ => unreachable!(),
+ }
+ }
+}
diff --git a/native/spark-expr/src/string_funcs/substring.rs
b/native/spark-expr/src/string_funcs/substring.rs
index f0ecb4e7e..5037a6e06 100644
--- a/native/spark-expr/src/string_funcs/substring.rs
+++ b/native/spark-expr/src/string_funcs/substring.rs
@@ -61,7 +61,7 @@ impl Display for SubstringExpr {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
write!(
f,
- "StringSpace [start: {}, len: {}, child: {}]",
+ "Substring [start: {}, len: {}, child: {}]",
self.start, self.len, self.child
)
}
diff --git
a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala
b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala
index f9882780c..662534b3c 100644
--- a/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/CometStringExpressionSuite.scala
@@ -376,7 +376,7 @@ class CometStringExpressionSuite extends CometTestBase {
}
test("string_space") {
- withParquetTable((0 until 5).map(i => (i, i + 1)), "tbl") {
+ withParquetTable((0 until 5).map(i => (-i, i + 1)), "tbl") {
checkSparkAnswerAndOperator("SELECT space(_1), space(_2) FROM tbl")
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]