This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new fc98d5c282 feat: Implement Spark `bitmap_bucket_number` function
(#20288)
fc98d5c282 is described below
commit fc98d5c2826949060ce8ecaddcc0f2113e603c74
Author: Kazantsev Maksim <[email protected]>
AuthorDate: Sat Feb 21 06:08:44 2026 +0400
feat: Implement Spark `bitmap_bucket_number` function (#20288)
## Which issue does this PR close?
N/A
## Rationale for this change
Add new function:
https://spark.apache.org/docs/latest/api/sql/index.html#bitmap_bucket_number
## What changes are included in this PR?
- Implementation
- Unit Tests
- SLT tests
## Are these changes tested?
Yes, tests added as part of this PR.
## Are there any user-facing changes?
No, this is a new function.
---------
Co-authored-by: Kazantsev Maksim <[email protected]>
---
.../src/function/bitmap/bitmap_bucket_number.rs | 141 +++++++++++++++++++++
datafusion/spark/src/function/bitmap/mod.rs | 16 ++-
.../spark/bitmap/bitmap_bucket_number.slt | 122 ++++++++++++++++++
3 files changed, 278 insertions(+), 1 deletion(-)
diff --git a/datafusion/spark/src/function/bitmap/bitmap_bucket_number.rs
b/datafusion/spark/src/function/bitmap/bitmap_bucket_number.rs
new file mode 100644
index 0000000000..fe72a4fe8a
--- /dev/null
+++ b/datafusion/spark/src/function/bitmap/bitmap_bucket_number.rs
@@ -0,0 +1,141 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, AsArray, Int64Array};
+use arrow::datatypes::Field;
+use arrow::datatypes::{DataType, FieldRef, Int8Type, Int16Type, Int32Type,
Int64Type};
+use datafusion::logical_expr::{ColumnarValue, Signature, TypeSignature,
Volatility};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, internal_err};
+use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl};
+use datafusion_functions::utils::make_scalar_function;
+use std::any::Any;
+use std::sync::Arc;
+
+/// Spark-compatible `bitmap_bucket_number` expression
+/// <https://spark.apache.org/docs/latest/api/sql/index.html#bitmap_bucket_number>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct BitmapBucketNumber {
+ signature: Signature, // accepted argument types; populated by `new()` and exposed via `signature()`
+}
+
+impl Default for BitmapBucketNumber {
+ fn default() -> Self {
+ Self::new() // the default instance is identical to the explicit constructor
+ }
+}
+
+impl BitmapBucketNumber {
+ pub fn new() -> Self { // builds the UDF together with its fixed signature
+ Self {
+ signature: Signature::one_of( // a call may match any one of the alternatives below
+ vec![
+ TypeSignature::Exact(vec![DataType::Int8]), // exactly one Int8 argument
+ TypeSignature::Exact(vec![DataType::Int16]), // exactly one Int16 argument
+ TypeSignature::Exact(vec![DataType::Int32]), // exactly one Int32 argument
+ TypeSignature::Exact(vec![DataType::Int64]), // exactly one Int64 argument
+ ],
+ Volatility::Immutable, // result depends only on the input value
+ ),
+ }
+ }
+}
+
+impl ScalarUDFImpl for BitmapBucketNumber {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ fn name(&self) -> &str {
+ "bitmap_bucket_number" // name reported for this UDF; also used as the output field name below
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+ internal_err!("return_field_from_args should be used instead") // always errors by design
+ }
+
+ fn return_field_from_args(
+ &self,
+ args: datafusion_expr::ReturnFieldArgs,
+ ) -> Result<FieldRef> {
+ Ok(Arc::new(Field::new(
+ self.name(),
+ DataType::Int64, // output is Int64 for every accepted input width
+ args.arg_fields[0].is_nullable(), // output nullability mirrors the first argument's
+ )))
+ }
+
+ fn invoke_with_args(&self, args: ScalarFunctionArgs) ->
Result<ColumnarValue> {
+ make_scalar_function(bitmap_bucket_number_inner, vec![])(&args.args) // adapts the array-based kernel to ColumnarValue arguments
+ }
+}
+
+pub fn bitmap_bucket_number_inner(arg: &[ArrayRef]) -> Result<ArrayRef> { // array kernel: one integer array in, one Int64Array out
+ let [array] = take_function_args("bitmap_bucket_number", arg)?; // destructures to exactly one argument, else returns the error
+ match &array.data_type() { // dispatch on the input's integer width
+ DataType::Int8 => {
+ let result: Int64Array = array
+ .as_primitive::<Int8Type>()
+ .iter()
+ .map(|opt| opt.map(|value| bitmap_bucket_number(value.into()))) // nulls stay null; i8 is widened to i64
+ .collect();
+ Ok(Arc::new(result))
+ }
+ DataType::Int16 => {
+ let result: Int64Array = array
+ .as_primitive::<Int16Type>()
+ .iter()
+ .map(|opt| opt.map(|value| bitmap_bucket_number(value.into()))) // nulls stay null; i16 is widened to i64
+ .collect();
+ Ok(Arc::new(result))
+ }
+ DataType::Int32 => {
+ let result: Int64Array = array
+ .as_primitive::<Int32Type>()
+ .iter()
+ .map(|opt| opt.map(|value| bitmap_bucket_number(value.into()))) // nulls stay null; i32 is widened to i64
+ .collect();
+ Ok(Arc::new(result))
+ }
+ DataType::Int64 => {
+ let result: Int64Array = array
+ .as_primitive::<Int64Type>()
+ .iter()
+ .map(|opt| opt.map(bitmap_bucket_number)) // already i64, so no widening is needed
+ .collect();
+ Ok(Arc::new(result))
+ }
+ data_type => {
+ internal_err!("bitmap_bucket_number does not support {data_type}") // any type other than the four integer widths handled above
+ }
+ }
+}
+
+const NUM_BYTES: i64 = 4 * 1024; // 4096
+const NUM_BITS: i64 = NUM_BYTES * 8; // 32768; used as the divisor (bucket width) below
+
+fn bitmap_bucket_number(value: i64) -> i64 { // maps a value to the number of the bucket it falls in
+ if value > 0 {
+ 1 + (value - 1) / NUM_BITS // positives: 1..=NUM_BITS -> 1, the next NUM_BITS values -> 2, and so on
+ } else {
+ value / NUM_BITS // zero and negatives: integer division truncates toward zero, so -(NUM_BITS - 1)..=0 -> 0
+ }
+}
diff --git a/datafusion/spark/src/function/bitmap/mod.rs
b/datafusion/spark/src/function/bitmap/mod.rs
index 1a7dce02db..4992992aea 100644
--- a/datafusion/spark/src/function/bitmap/mod.rs
+++ b/datafusion/spark/src/function/bitmap/mod.rs
@@ -16,6 +16,7 @@
// under the License.
pub mod bitmap_bit_position;
+pub mod bitmap_bucket_number;
pub mod bitmap_count;
use datafusion_expr::ScalarUDF;
@@ -24,6 +25,10 @@ use std::sync::Arc;
make_udf_function!(bitmap_count::BitmapCount, bitmap_count);
make_udf_function!(bitmap_bit_position::BitmapBitPosition,
bitmap_bit_position);
+make_udf_function!(
+ bitmap_bucket_number::BitmapBucketNumber,
+ bitmap_bucket_number
+);
pub mod expr_fn {
use datafusion_functions::export_functions;
@@ -38,8 +43,17 @@ pub mod expr_fn {
"Returns the bit position for the given input child expression.",
arg
));
+ export_functions!((
+ bitmap_bucket_number,
+ "Returns the bucket number for the given input child expression.",
+ arg
+ ));
}
pub fn functions() -> Vec<Arc<ScalarUDF>> {
- vec![bitmap_count(), bitmap_bit_position()]
+ vec![
+ bitmap_count(),
+ bitmap_bit_position(),
+ bitmap_bucket_number(),
+ ]
}
diff --git
a/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_bucket_number.slt
b/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_bucket_number.slt
new file mode 100644
index 0000000000..2a6e190b31
--- /dev/null
+++ b/datafusion/sqllogictest/test_files/spark/bitmap/bitmap_bucket_number.slt
@@ -0,0 +1,122 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(1, 'Int8'));
+----
+1
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(127, 'Int8'));
+----
+1
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(-1, 'Int8'));
+----
+0
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(-64, 'Int8'));
+----
+0
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(-65, 'Int8'));
+----
+0
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(1, 'Int16'));
+----
+1
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(257, 'Int16'));
+----
+1
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(32767, 'Int16'));
+----
+1
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(-1, 'Int16'));
+----
+0
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(-256, 'Int16'));
+----
+0
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(1, 'Int32'));
+----
+1
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(65537, 'Int32'));
+----
+3
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(2147483647, 'Int32'));
+----
+65536
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(-1, 'Int32'));
+----
+0
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(-65536, 'Int32'));
+----
+-2
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(1, 'Int64'));
+----
+1
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(4294967297, 'Int64'));
+----
+131073
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(9223372036854775807, 'Int64'));
+----
+281474976710656
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(-1, 'Int64'));
+----
+0
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(-4294967296, 'Int64'));
+----
+-131072
+
+query I
+SELECT bitmap_bucket_number(arrow_cast(-9223372036854775808, 'Int64'));
+----
+-281474976710656
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]