Jefffrey commented on code in PR #18198:
URL: https://github.com/apache/datafusion/pull/18198#discussion_r2448015895


##########
datafusion/functions-nested/src/range.rs:
##########
@@ -53,13 +56,23 @@ make_udf_expr_and_func!(
     range,
     start stop step,
     "create a list of values in the range between start and stop",
-    range_udf
+    range_udf,
+    Range::new
+);
+
+make_udf_expr_and_func!(
+    GenSeries,
+    gen_series,
+    start stop step,
+    "create a list of values in the range between start and stop, include 
upper bound",
+    gen_series_udf,
+    Range::generate_series

Review Comment:
   Custom constructor here



##########
datafusion/functions-nested/src/macros.rs:
##########
@@ -41,10 +41,15 @@
 /// * `arg`: 0 or more named arguments for the function
 /// * `DOC`: documentation string for the function
 /// * `SCALAR_UDF_FUNC`: name of the function to create (just) the `ScalarUDF`
+/// * (optional) `$CTOR`: Pass a custom constructor. When omitted it
+///   automatically resolves to `$UDF::new()`.
 ///
 /// [`ScalarUDFImpl`]: datafusion_expr::ScalarUDFImpl
 macro_rules! make_udf_expr_and_func {
-    ($UDF:ty, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr , 
$SCALAR_UDF_FN:ident) => {
+    ($UDF:ident, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr, 
$SCALAR_UDF_FN:ident) => {
+        make_udf_expr_and_func!($UDF, $EXPR_FN, $($arg)*, $DOC, 
$SCALAR_UDF_FN, $UDF::new);
+    };
+    ($UDF:ident, $EXPR_FN:ident, $($arg:ident)*, $DOC:expr, 
$SCALAR_UDF_FN:ident, $CTOR:path) => {

Review Comment:
   Just like what window functions do:
   
   
https://github.com/apache/datafusion/blob/155b56e521d75186776a65f1634ee03058899a79/datafusion/functions-window/src/macros.rs#L96-L105
   
   So we can use same `Range` struct with different constructor methods



##########
datafusion/functions-nested/src/range.rs:
##########
@@ -286,107 +221,263 @@ impl ScalarUDFImpl for GenSeries {
         let args = &args.args;
 
         if args.iter().any(|arg| arg.data_type().is_null()) {
-            return Ok(ColumnarValue::Array(Arc::new(NullArray::new(1))));
+            return Ok(ColumnarValue::Scalar(ScalarValue::Null));
         }
         match args[0].data_type() {
-            Int64 => make_scalar_function(|args| gen_range_inner(args, 
true))(args),
-            Date32 => make_scalar_function(|args| gen_range_date(args, 
true))(args),
+            Int64 => make_scalar_function(|args| 
self.gen_range_inner(args))(args),
+            Date32 => make_scalar_function(|args| 
self.gen_range_date(args))(args),
             Timestamp(_, _) => {
-                make_scalar_function(|args| gen_range_timestamp(args, 
true))(args)
+                make_scalar_function(|args| 
self.gen_range_timestamp(args))(args)
             }
             dt => {
-                exec_err!(
-                    "unsupported type for GENERATE_SERIES. Expected Int64, 
Date32 or Timestamp, got: {}",
-                    dt
-                )
+                exec_err!("unsupported type for {}. Expected Int64, Date32 or 
Timestamp, got: {dt}", self.name())
             }
         }
     }
 
-    fn aliases(&self) -> &[String] {
-        &self.aliases
-    }
-
     fn documentation(&self) -> Option<&Documentation> {
-        self.doc()
+        if self.include_upper_bound {
+            GenerateSeriesDoc {}.doc()
+        } else {
+            RangeDoc {}.doc()
+        }
     }
 }
 
-/// Generates an array of integers from start to stop with a given step.
-///
-/// This function takes 1 to 3 ArrayRefs as arguments, representing start, 
stop, and step values.
-/// It returns a `Result<ArrayRef>` representing the resulting ListArray after 
the operation.
-///
-/// # Arguments
-///
-/// * `args` - An array of 1 to 3 ArrayRefs representing start, stop, and 
step(step value can not be zero.) values.
-///
-/// # Examples
-///
-/// gen_range(3) => [0, 1, 2]
-/// gen_range(1, 4) => [1, 2, 3]
-/// gen_range(1, 7, 2) => [1, 3, 5]
-pub(super) fn gen_range_inner(

Review Comment:
   Pulled these into the `impl Range` so they can easily access `self.name()` 
and `self.include_upper_bound`



##########
datafusion/functions-nested/src/range.rs:
##########
@@ -286,107 +221,263 @@ impl ScalarUDFImpl for GenSeries {
         let args = &args.args;
 
         if args.iter().any(|arg| arg.data_type().is_null()) {
-            return Ok(ColumnarValue::Array(Arc::new(NullArray::new(1))));
+            return Ok(ColumnarValue::Scalar(ScalarValue::Null));
         }
         match args[0].data_type() {
-            Int64 => make_scalar_function(|args| gen_range_inner(args, 
true))(args),
-            Date32 => make_scalar_function(|args| gen_range_date(args, 
true))(args),
+            Int64 => make_scalar_function(|args| 
self.gen_range_inner(args))(args),
+            Date32 => make_scalar_function(|args| 
self.gen_range_date(args))(args),
             Timestamp(_, _) => {
-                make_scalar_function(|args| gen_range_timestamp(args, 
true))(args)
+                make_scalar_function(|args| 
self.gen_range_timestamp(args))(args)
             }
             dt => {
-                exec_err!(
-                    "unsupported type for GENERATE_SERIES. Expected Int64, 
Date32 or Timestamp, got: {}",
-                    dt
-                )
+                exec_err!("unsupported type for {}. Expected Int64, Date32 or 
Timestamp, got: {dt}", self.name())
             }
         }
     }
 
-    fn aliases(&self) -> &[String] {
-        &self.aliases
-    }
-
     fn documentation(&self) -> Option<&Documentation> {
-        self.doc()
+        if self.include_upper_bound {
+            GenerateSeriesDoc {}.doc()
+        } else {
+            RangeDoc {}.doc()
+        }
     }
 }
 
-/// Generates an array of integers from start to stop with a given step.
-///
-/// This function takes 1 to 3 ArrayRefs as arguments, representing start, 
stop, and step values.
-/// It returns a `Result<ArrayRef>` representing the resulting ListArray after 
the operation.
-///
-/// # Arguments
-///
-/// * `args` - An array of 1 to 3 ArrayRefs representing start, stop, and 
step(step value can not be zero.) values.
-///
-/// # Examples
-///
-/// gen_range(3) => [0, 1, 2]
-/// gen_range(1, 4) => [1, 2, 3]
-/// gen_range(1, 7, 2) => [1, 3, 5]
-pub(super) fn gen_range_inner(
-    args: &[ArrayRef],
-    include_upper: bool,
-) -> Result<ArrayRef> {
-    let (start_array, stop_array, step_array) = match args.len() {
-        1 => (None, as_int64_array(&args[0])?, None),
-        2 => (
-            Some(as_int64_array(&args[0])?),
-            as_int64_array(&args[1])?,
-            None,
-        ),
-        3 => (
-            Some(as_int64_array(&args[0])?),
-            as_int64_array(&args[1])?,
-            Some(as_int64_array(&args[2])?),
-        ),
-        _ => return exec_err!("gen_range expects 1 to 3 arguments"),
-    };
-
-    let mut values = vec![];
-    let mut offsets = vec![0];
-    let mut valid = NullBufferBuilder::new(stop_array.len());
-    for (idx, stop) in stop_array.iter().enumerate() {
-        match retrieve_range_args(start_array, stop, step_array, idx) {
-            Some((_, _, 0)) => {
-                return exec_err!(
-                    "step can't be 0 for function {}(start [, stop, step])",
-                    if include_upper {
-                        "generate_series"
-                    } else {
-                        "range"
-                    }
-                );
+impl Range {
+    /// Generates an array of integers from start to stop with a given step.
+    ///
+    /// This function takes 1 to 3 ArrayRefs as arguments, representing start, 
stop, and step values.
+    /// It returns a `Result<ArrayRef>` representing the resulting ListArray 
after the operation.
+    ///
+    /// # Arguments
+    ///
+    /// * `args` - An array of 1 to 3 ArrayRefs representing start, stop, and 
step(step value can not be zero.) values.
+    ///
+    /// # Examples
+    ///
+    /// gen_range(3) => [0, 1, 2]
+    /// gen_range(1, 4) => [1, 2, 3]
+    /// gen_range(1, 7, 2) => [1, 3, 5]
+    fn gen_range_inner(&self, args: &[ArrayRef]) -> Result<ArrayRef> {
+        let (start_array, stop_array, step_array) = match args {
+            [stop_array] => (None, as_int64_array(stop_array)?, None),
+            [start_array, stop_array] => (
+                Some(as_int64_array(start_array)?),
+                as_int64_array(stop_array)?,
+                None,
+            ),
+            [start_array, stop_array, step_array] => (
+                Some(as_int64_array(start_array)?),
+                as_int64_array(stop_array)?,
+                Some(as_int64_array(step_array)?),
+            ),
+            _ => return exec_err!("{} expects 1 to 3 arguments", self.name()),
+        };
+
+        let mut values = vec![];
+        let mut offsets = vec![0];
+        let mut valid = NullBufferBuilder::new(stop_array.len());
+        for (idx, stop) in stop_array.iter().enumerate() {
+            match retrieve_range_args(start_array, stop, step_array, idx) {
+                Some((_, _, 0)) => {
+                    return exec_err!(
+                        "step can't be 0 for function {}(start [, stop, 
step])",
+                        self.name()
+                    );
+                }
+                Some((start, stop, step)) => {
+                    // Below, we utilize `usize` to represent steps.
+                    // On 32-bit targets, the absolute value of `i64` may fail 
to fit into `usize`.
+                    let step_abs =
+                        usize::try_from(step.unsigned_abs()).map_err(|_| {
+                            not_impl_datafusion_err!("step {} can't fit into 
usize", step)
+                        })?;
+                    values.extend(
+                        gen_range_iter(start, stop, step < 0, 
self.include_upper_bound)
+                            .step_by(step_abs),
+                    );
+                    offsets.push(values.len() as i32);
+                    valid.append_non_null();
+                }
+                // If any of the arguments is NULL, append a NULL value to the 
result.
+                None => {
+                    offsets.push(values.len() as i32);
+                    valid.append_null();
+                }
+            };
+        }
+        let arr = Arc::new(ListArray::try_new(
+            Arc::new(Field::new_list_field(Int64, true)),
+            OffsetBuffer::new(offsets.into()),
+            Arc::new(Int64Array::from(values)),
+            valid.finish(),
+        )?);
+        Ok(arr)
+    }
+
+    fn gen_range_date(&self, args: &[ArrayRef]) -> Result<ArrayRef> {
+        let [start, stop, step] = take_function_args(self.name(), args)?;
+
+        let (start_array, stop_array, step_array) = (
+            as_date32_array(start)?,
+            as_date32_array(stop)?,
+            as_interval_mdn_array(step)?,
+        );
+
+        // values are date32s
+        let values_builder = Date32Builder::new();
+        let mut list_builder = ListBuilder::new(values_builder);
+
+        for idx in 0..stop_array.len() {
+            if start_array.is_null(idx)
+                || stop_array.is_null(idx)
+                || step_array.is_null(idx)
+            {
+                list_builder.append_null();
+                continue;
             }
-            Some((start, stop, step)) => {
-                // Below, we utilize `usize` to represent steps.
-                // On 32-bit targets, the absolute value of `i64` may fail to 
fit into `usize`.
-                let step_abs = 
usize::try_from(step.unsigned_abs()).map_err(|_| {
-                    not_impl_datafusion_err!("step {} can't fit into usize", 
step)
-                })?;
-                values.extend(
-                    gen_range_iter(start, stop, step < 0, include_upper)
-                        .step_by(step_abs),
-                );
-                offsets.push(values.len() as i32);
-                valid.append_non_null();
+
+            let start = start_array.value(idx);
+            let stop = stop_array.value(idx);
+            let step = step_array.value(idx);
+
+            let (months, days, _) = IntervalMonthDayNanoType::to_parts(step);
+            if months == 0 && days == 0 {
+                return exec_err!("Cannot generate date range less than 1 
day.");
+            }
+
+            let stop = if !self.include_upper_bound {
+                Date32Type::subtract_month_day_nano(stop, step)
+            } else {
+                stop
+            };
+
+            let neg = months < 0 || days < 0;
+            let mut new_date = start;
+
+            let values = from_fn(|| {
+                if (neg && new_date < stop) || (!neg && new_date > stop) {
+                    None
+                } else {
+                    let current_date = new_date;
+                    new_date = Date32Type::add_month_day_nano(new_date, step);
+                    Some(Some(current_date))
+                }
+            });
+
+            list_builder.append_value(values);
+        }
+
+        let arr = Arc::new(list_builder.finish());
+
+        Ok(arr)
+    }
+
+    fn gen_range_timestamp(&self, args: &[ArrayRef]) -> Result<ArrayRef> {
+        let [start, stop, step] = take_function_args(self.name(), args)?;
+
+        // coerce_types fn should coerce all types to Timestamp(Nanosecond, tz)
+        // TODO: remove these map_err once the signature is robust enough to 
guard against this

Review Comment:
   Looking to do this for #15881 itself



##########
datafusion/functions-nested/src/range.rs:
##########
@@ -88,115 +101,12 @@ make_udf_expr_and_func!(
         description = "Increase by step (cannot be 0). Steps less than a day 
are supported only for timestamp ranges."
     )
 )]
-#[derive(Debug, PartialEq, Eq, Hash)]
-pub struct Range {
-    signature: Signature,
-    aliases: Vec<String>,
-}
-
-impl Default for Range {
-    fn default() -> Self {
-        Self::new()
-    }
-}
-impl Range {
-    pub fn new() -> Self {
-        Self {
-            signature: Signature::user_defined(Volatility::Immutable),
-            aliases: vec![],
-        }
-    }
-}
-impl ScalarUDFImpl for Range {
-    fn as_any(&self) -> &dyn Any {
-        self
-    }
-    fn name(&self) -> &str {
-        "range"
-    }
-
-    fn signature(&self) -> &Signature {
-        &self.signature
-    }
-
-    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
-        arg_types
-            .iter()
-            .map(|arg_type| match arg_type {
-                Null => Ok(Null),
-                Int8 => Ok(Int64),
-                Int16 => Ok(Int64),
-                Int32 => Ok(Int64),
-                Int64 => Ok(Int64),
-                UInt8 => Ok(Int64),
-                UInt16 => Ok(Int64),
-                UInt32 => Ok(Int64),
-                UInt64 => Ok(Int64),
-                Timestamp(_, tz) => Ok(Timestamp(Nanosecond, tz.clone())),
-                Date32 => Ok(Date32),
-                Date64 => Ok(Date32),
-                Utf8 => Ok(Date32),
-                LargeUtf8 => Ok(Date32),
-                Utf8View => Ok(Date32),
-                Interval(_) => Ok(Interval(MonthDayNano)),
-                _ => exec_err!("Unsupported DataType"),
-            })
-            .try_collect()
-    }
-
-    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
-        if arg_types.iter().any(|t| t.is_null()) {
-            Ok(Null)
-        } else {
-            Ok(List(Arc::new(Field::new_list_field(
-                arg_types[0].clone(),
-                true,
-            ))))
-        }
-    }
-
-    fn invoke_with_args(
-        &self,
-        args: datafusion_expr::ScalarFunctionArgs,
-    ) -> Result<ColumnarValue> {
-        let args = &args.args;
-
-        if args.iter().any(|arg| arg.data_type().is_null()) {
-            return Ok(ColumnarValue::Array(Arc::new(NullArray::new(1))));
-        }
-        match args[0].data_type() {
-            Int64 => make_scalar_function(|args| gen_range_inner(args, 
false))(args),
-            Date32 => make_scalar_function(|args| gen_range_date(args, 
false))(args),
-            Timestamp(_, _) => {
-                make_scalar_function(|args| gen_range_timestamp(args, 
false))(args)
-            }
-            dt => {
-                exec_err!("unsupported type for RANGE. Expected Int64, Date32 
or Timestamp, got: {dt}")
-            }
-        }
-    }
-
-    fn aliases(&self) -> &[String] {
-        &self.aliases
-    }
-
-    fn documentation(&self) -> Option<&Documentation> {
-        self.doc()
-    }
-}
-
-make_udf_expr_and_func!(
-    GenSeries,
-    gen_series,
-    start stop step,
-    "create a list of values in the range between start and stop, include 
upper bound",
-    gen_series_udf
-);
+struct RangeDoc {}

Review Comment:
   Empty struct just so I can use the `user_doc` attribute; I was going to do 
it like this:
   
   
https://github.com/apache/datafusion/blob/155b56e521d75186776a65f1634ee03058899a79/datafusion/functions-window/src/rank.rs#L107-L140
   
   But I prefer using the `user_doc` attribute as it looks nicer imo; happy to 
switch if this isn't recommended



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to