Jefffrey commented on code in PR #18198:
URL: https://github.com/apache/datafusion/pull/18198#discussion_r2448015895
##########
datafusion/functions-nested/src/range.rs:
##########
@@ -53,13 +56,23 @@ make_udf_expr_and_func!(
range,
start stop step,
"create a list of values in the range between start and stop",
- range_udf
+ range_udf,
+ Range::new
+);
+
+make_udf_expr_and_func!(
+ GenSeries,
+ gen_series,
+ start stop step,
+ "create a list of values in the range between start and stop, include
upper bound",
+ gen_series_udf,
+ Range::generate_series
Review Comment:
Custom constructor here
##########
datafusion/functions-nested/src/macros.rs:
##########
@@ -41,10 +41,15 @@
/// * `arg`: 0 or more named arguments for the function
/// * `DOC`: documentation string for the function
/// * `SCALAR_UDF_FUNC`: name of the function to create (just) the `ScalarUDF`
+/// * (optional) `$CTOR`: Pass a custom constructor. When omitted it
+/// automatically resolves to `$UDF::new()`.
///
/// [`ScalarUDFImpl`]: datafusion_expr::ScalarUDFImpl
macro_rules! make_udf_expr_and_func {
- ($UDF:ty, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr ,
$SCALAR_UDF_FN:ident) => {
+ ($UDF:ident, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr,
$SCALAR_UDF_FN:ident) => {
+ make_udf_expr_and_func!($UDF, $EXPR_FN, $($arg)*, $DOC,
$SCALAR_UDF_FN, $UDF::new);
+ };
+ ($UDF:ident, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr,
$SCALAR_UDF_FN:ident, $CTOR:path) => {
Review Comment:
Just like what window functions do:
https://github.com/apache/datafusion/blob/155b56e521d75186776a65f1634ee03058899a79/datafusion/functions-window/src/macros.rs#L96-L105
So we can use same `Range` struct with different constructor methods
##########
datafusion/functions-nested/src/range.rs:
##########
@@ -286,107 +221,263 @@ impl ScalarUDFImpl for GenSeries {
let args = &args.args;
if args.iter().any(|arg| arg.data_type().is_null()) {
- return Ok(ColumnarValue::Array(Arc::new(NullArray::new(1))));
+ return Ok(ColumnarValue::Scalar(ScalarValue::Null));
}
match args[0].data_type() {
- Int64 => make_scalar_function(|args| gen_range_inner(args,
true))(args),
- Date32 => make_scalar_function(|args| gen_range_date(args,
true))(args),
+ Int64 => make_scalar_function(|args|
self.gen_range_inner(args))(args),
+ Date32 => make_scalar_function(|args|
self.gen_range_date(args))(args),
Timestamp(_, _) => {
- make_scalar_function(|args| gen_range_timestamp(args,
true))(args)
+ make_scalar_function(|args|
self.gen_range_timestamp(args))(args)
}
dt => {
- exec_err!(
- "unsupported type for GENERATE_SERIES. Expected Int64,
Date32 or Timestamp, got: {}",
- dt
- )
+ exec_err!("unsupported type for {}. Expected Int64, Date32 or
Timestamp, got: {dt}", self.name())
}
}
}
- fn aliases(&self) -> &[String] {
- &self.aliases
- }
-
fn documentation(&self) -> Option<&Documentation> {
- self.doc()
+ if self.include_upper_bound {
+ GenerateSeriesDoc {}.doc()
+ } else {
+ RangeDoc {}.doc()
+ }
}
}
-/// Generates an array of integers from start to stop with a given step.
-///
-/// This function takes 1 to 3 ArrayRefs as arguments, representing start,
stop, and step values.
-/// It returns a `Result<ArrayRef>` representing the resulting ListArray after
the operation.
-///
-/// # Arguments
-///
-/// * `args` - An array of 1 to 3 ArrayRefs representing start, stop, and
step(step value can not be zero.) values.
-///
-/// # Examples
-///
-/// gen_range(3) => [0, 1, 2]
-/// gen_range(1, 4) => [1, 2, 3]
-/// gen_range(1, 7, 2) => [1, 3, 5]
-pub(super) fn gen_range_inner(
Review Comment:
Pulled these into the `impl Range` so they can easily access `self.name()`
and `self.include_upper_bound`
##########
datafusion/functions-nested/src/range.rs:
##########
@@ -286,107 +221,263 @@ impl ScalarUDFImpl for GenSeries {
let args = &args.args;
if args.iter().any(|arg| arg.data_type().is_null()) {
- return Ok(ColumnarValue::Array(Arc::new(NullArray::new(1))));
+ return Ok(ColumnarValue::Scalar(ScalarValue::Null));
}
match args[0].data_type() {
- Int64 => make_scalar_function(|args| gen_range_inner(args,
true))(args),
- Date32 => make_scalar_function(|args| gen_range_date(args,
true))(args),
+ Int64 => make_scalar_function(|args|
self.gen_range_inner(args))(args),
+ Date32 => make_scalar_function(|args|
self.gen_range_date(args))(args),
Timestamp(_, _) => {
- make_scalar_function(|args| gen_range_timestamp(args,
true))(args)
+ make_scalar_function(|args|
self.gen_range_timestamp(args))(args)
}
dt => {
- exec_err!(
- "unsupported type for GENERATE_SERIES. Expected Int64,
Date32 or Timestamp, got: {}",
- dt
- )
+ exec_err!("unsupported type for {}. Expected Int64, Date32 or
Timestamp, got: {dt}", self.name())
}
}
}
- fn aliases(&self) -> &[String] {
- &self.aliases
- }
-
fn documentation(&self) -> Option<&Documentation> {
- self.doc()
+ if self.include_upper_bound {
+ GenerateSeriesDoc {}.doc()
+ } else {
+ RangeDoc {}.doc()
+ }
}
}
-/// Generates an array of integers from start to stop with a given step.
-///
-/// This function takes 1 to 3 ArrayRefs as arguments, representing start,
stop, and step values.
-/// It returns a `Result<ArrayRef>` representing the resulting ListArray after
the operation.
-///
-/// # Arguments
-///
-/// * `args` - An array of 1 to 3 ArrayRefs representing start, stop, and
step(step value can not be zero.) values.
-///
-/// # Examples
-///
-/// gen_range(3) => [0, 1, 2]
-/// gen_range(1, 4) => [1, 2, 3]
-/// gen_range(1, 7, 2) => [1, 3, 5]
-pub(super) fn gen_range_inner(
- args: &[ArrayRef],
- include_upper: bool,
-) -> Result<ArrayRef> {
- let (start_array, stop_array, step_array) = match args.len() {
- 1 => (None, as_int64_array(&args[0])?, None),
- 2 => (
- Some(as_int64_array(&args[0])?),
- as_int64_array(&args[1])?,
- None,
- ),
- 3 => (
- Some(as_int64_array(&args[0])?),
- as_int64_array(&args[1])?,
- Some(as_int64_array(&args[2])?),
- ),
- _ => return exec_err!("gen_range expects 1 to 3 arguments"),
- };
-
- let mut values = vec![];
- let mut offsets = vec![0];
- let mut valid = NullBufferBuilder::new(stop_array.len());
- for (idx, stop) in stop_array.iter().enumerate() {
- match retrieve_range_args(start_array, stop, step_array, idx) {
- Some((_, _, 0)) => {
- return exec_err!(
- "step can't be 0 for function {}(start [, stop, step])",
- if include_upper {
- "generate_series"
- } else {
- "range"
- }
- );
+impl Range {
+ /// Generates an array of integers from start to stop with a given step.
+ ///
+ /// This function takes 1 to 3 ArrayRefs as arguments, representing start,
stop, and step values.
+ /// It returns a `Result<ArrayRef>` representing the resulting ListArray
after the operation.
+ ///
+ /// # Arguments
+ ///
+ /// * `args` - An array of 1 to 3 ArrayRefs representing start, stop, and
step(step value can not be zero.) values.
+ ///
+ /// # Examples
+ ///
+ /// gen_range(3) => [0, 1, 2]
+ /// gen_range(1, 4) => [1, 2, 3]
+ /// gen_range(1, 7, 2) => [1, 3, 5]
+ fn gen_range_inner(&self, args: &[ArrayRef]) -> Result<ArrayRef> {
+ let (start_array, stop_array, step_array) = match args {
+ [stop_array] => (None, as_int64_array(stop_array)?, None),
+ [start_array, stop_array] => (
+ Some(as_int64_array(start_array)?),
+ as_int64_array(stop_array)?,
+ None,
+ ),
+ [start_array, stop_array, step_array] => (
+ Some(as_int64_array(start_array)?),
+ as_int64_array(stop_array)?,
+ Some(as_int64_array(step_array)?),
+ ),
+ _ => return exec_err!("{} expects 1 to 3 arguments", self.name()),
+ };
+
+ let mut values = vec![];
+ let mut offsets = vec![0];
+ let mut valid = NullBufferBuilder::new(stop_array.len());
+ for (idx, stop) in stop_array.iter().enumerate() {
+ match retrieve_range_args(start_array, stop, step_array, idx) {
+ Some((_, _, 0)) => {
+ return exec_err!(
+ "step can't be 0 for function {}(start [, stop,
step])",
+ self.name()
+ );
+ }
+ Some((start, stop, step)) => {
+ // Below, we utilize `usize` to represent steps.
+ // On 32-bit targets, the absolute value of `i64` may fail
to fit into `usize`.
+ let step_abs =
+ usize::try_from(step.unsigned_abs()).map_err(|_| {
+ not_impl_datafusion_err!("step {} can't fit into
usize", step)
+ })?;
+ values.extend(
+ gen_range_iter(start, stop, step < 0,
self.include_upper_bound)
+ .step_by(step_abs),
+ );
+ offsets.push(values.len() as i32);
+ valid.append_non_null();
+ }
+ // If any of the arguments is NULL, append a NULL value to the
result.
+ None => {
+ offsets.push(values.len() as i32);
+ valid.append_null();
+ }
+ };
+ }
+ let arr = Arc::new(ListArray::try_new(
+ Arc::new(Field::new_list_field(Int64, true)),
+ OffsetBuffer::new(offsets.into()),
+ Arc::new(Int64Array::from(values)),
+ valid.finish(),
+ )?);
+ Ok(arr)
+ }
+
+ fn gen_range_date(&self, args: &[ArrayRef]) -> Result<ArrayRef> {
+ let [start, stop, step] = take_function_args(self.name(), args)?;
+
+ let (start_array, stop_array, step_array) = (
+ as_date32_array(start)?,
+ as_date32_array(stop)?,
+ as_interval_mdn_array(step)?,
+ );
+
+ // values are date32s
+ let values_builder = Date32Builder::new();
+ let mut list_builder = ListBuilder::new(values_builder);
+
+ for idx in 0..stop_array.len() {
+ if start_array.is_null(idx)
+ || stop_array.is_null(idx)
+ || step_array.is_null(idx)
+ {
+ list_builder.append_null();
+ continue;
}
- Some((start, stop, step)) => {
- // Below, we utilize `usize` to represent steps.
- // On 32-bit targets, the absolute value of `i64` may fail to
fit into `usize`.
- let step_abs =
usize::try_from(step.unsigned_abs()).map_err(|_| {
- not_impl_datafusion_err!("step {} can't fit into usize",
step)
- })?;
- values.extend(
- gen_range_iter(start, stop, step < 0, include_upper)
- .step_by(step_abs),
- );
- offsets.push(values.len() as i32);
- valid.append_non_null();
+
+ let start = start_array.value(idx);
+ let stop = stop_array.value(idx);
+ let step = step_array.value(idx);
+
+ let (months, days, _) = IntervalMonthDayNanoType::to_parts(step);
+ if months == 0 && days == 0 {
+ return exec_err!("Cannot generate date range less than 1
day.");
+ }
+
+ let stop = if !self.include_upper_bound {
+ Date32Type::subtract_month_day_nano(stop, step)
+ } else {
+ stop
+ };
+
+ let neg = months < 0 || days < 0;
+ let mut new_date = start;
+
+ let values = from_fn(|| {
+ if (neg && new_date < stop) || (!neg && new_date > stop) {
+ None
+ } else {
+ let current_date = new_date;
+ new_date = Date32Type::add_month_day_nano(new_date, step);
+ Some(Some(current_date))
+ }
+ });
+
+ list_builder.append_value(values);
+ }
+
+ let arr = Arc::new(list_builder.finish());
+
+ Ok(arr)
+ }
+
+ fn gen_range_timestamp(&self, args: &[ArrayRef]) -> Result<ArrayRef> {
+ let [start, stop, step] = take_function_args(self.name(), args)?;
+
+ // coerce_types fn should coerce all types to Timestamp(Nanosecond, tz)
+ // TODO: remove these map_err once the signature is robust enough to
guard against this
Review Comment:
Looking to do this for #15881 itself
##########
datafusion/functions-nested/src/range.rs:
##########
@@ -88,115 +101,12 @@ make_udf_expr_and_func!(
description = "Increase by step (cannot be 0). Steps less than a day
are supported only for timestamp ranges."
)
)]
-#[derive(Debug, PartialEq, Eq, Hash)]
-pub struct Range {
- signature: Signature,
- aliases: Vec<String>,
-}
-
-impl Default for Range {
- fn default() -> Self {
- Self::new()
- }
-}
-impl Range {
- pub fn new() -> Self {
- Self {
- signature: Signature::user_defined(Volatility::Immutable),
- aliases: vec![],
- }
- }
-}
-impl ScalarUDFImpl for Range {
- fn as_any(&self) -> &dyn Any {
- self
- }
- fn name(&self) -> &str {
- "range"
- }
-
- fn signature(&self) -> &Signature {
- &self.signature
- }
-
- fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
- arg_types
- .iter()
- .map(|arg_type| match arg_type {
- Null => Ok(Null),
- Int8 => Ok(Int64),
- Int16 => Ok(Int64),
- Int32 => Ok(Int64),
- Int64 => Ok(Int64),
- UInt8 => Ok(Int64),
- UInt16 => Ok(Int64),
- UInt32 => Ok(Int64),
- UInt64 => Ok(Int64),
- Timestamp(_, tz) => Ok(Timestamp(Nanosecond, tz.clone())),
- Date32 => Ok(Date32),
- Date64 => Ok(Date32),
- Utf8 => Ok(Date32),
- LargeUtf8 => Ok(Date32),
- Utf8View => Ok(Date32),
- Interval(_) => Ok(Interval(MonthDayNano)),
- _ => exec_err!("Unsupported DataType"),
- })
- .try_collect()
- }
-
- fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
- if arg_types.iter().any(|t| t.is_null()) {
- Ok(Null)
- } else {
- Ok(List(Arc::new(Field::new_list_field(
- arg_types[0].clone(),
- true,
- ))))
- }
- }
-
- fn invoke_with_args(
- &self,
- args: datafusion_expr::ScalarFunctionArgs,
- ) -> Result<ColumnarValue> {
- let args = &args.args;
-
- if args.iter().any(|arg| arg.data_type().is_null()) {
- return Ok(ColumnarValue::Array(Arc::new(NullArray::new(1))));
- }
- match args[0].data_type() {
- Int64 => make_scalar_function(|args| gen_range_inner(args,
false))(args),
- Date32 => make_scalar_function(|args| gen_range_date(args,
false))(args),
- Timestamp(_, _) => {
- make_scalar_function(|args| gen_range_timestamp(args,
false))(args)
- }
- dt => {
- exec_err!("unsupported type for RANGE. Expected Int64, Date32
or Timestamp, got: {dt}")
- }
- }
- }
-
- fn aliases(&self) -> &[String] {
- &self.aliases
- }
-
- fn documentation(&self) -> Option<&Documentation> {
- self.doc()
- }
-}
-
-make_udf_expr_and_func!(
- GenSeries,
- gen_series,
- start stop step,
- "create a list of values in the range between start and stop, include
upper bound",
- gen_series_udf
-);
+struct RangeDoc {}
Review Comment:
Empty struct just so I can use the `user_doc` attribute; I was going to do
it like this:
https://github.com/apache/datafusion/blob/155b56e521d75186776a65f1634ee03058899a79/datafusion/functions-window/src/rank.rs#L107-L140
But I prefer using the `user_doc` attribute as it looks nicer imo; happy to
switch if this isn't recommended
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]