alamb commented on code in PR #16333: URL: https://github.com/apache/datafusion/pull/16333#discussion_r2135784803
########## datafusion/functions-nested/src/array_has.rs: ########## @@ -232,98 +236,244 @@ fn array_has_inner_for_array(haystack: &ArrayRef, needle: &ArrayRef) -> Result<A match haystack.data_type() { DataType::List(_) => array_has_dispatch_for_array::<i32>(haystack, needle), DataType::LargeList(_) => array_has_dispatch_for_array::<i64>(haystack, needle), + DataType::FixedSizeList(_, _) => { + array_has_dispatch_for_fixed_size_list_array(haystack, needle) + } _ => exec_err!( "array_has does not support type '{:?}'.", haystack.data_type() ), } } +macro_rules! array_has_dispatch_for_array { + ($haystack:expr, $needle:expr) => {{ + let mut boolean_builder = BooleanArray::builder($haystack.len()); + + for (i, arr) in $haystack.iter().enumerate() { + if arr.is_none() || $needle.is_null(i) { + boolean_builder.append_null(); + continue; + } + let arr = arr.unwrap(); + let is_nested = arr.data_type().is_nested(); + let needle_row = Scalar::new($needle.slice(i, 1)); + let eq_array = compare_with_eq(&arr, &needle_row, is_nested)?; + boolean_builder.append_value(eq_array.true_count() > 0); + } + + Ok(Arc::new(boolean_builder.finish())) + }}; +} + +fn array_has_dispatch_for_fixed_size_list_array( + haystack: &ArrayRef, + needle: &ArrayRef, +) -> Result<ArrayRef> { + let haystack = as_fixed_size_list_array(haystack); + array_has_dispatch_for_array!(haystack, needle) +} + fn array_has_dispatch_for_array<O: OffsetSizeTrait>( haystack: &ArrayRef, needle: &ArrayRef, ) -> Result<ArrayRef> { let haystack = as_generic_list_array::<O>(haystack)?; - let mut boolean_builder = BooleanArray::builder(haystack.len()); + array_has_dispatch_for_array!(haystack, needle) +} - for (i, arr) in haystack.iter().enumerate() { - if arr.is_none() || needle.is_null(i) { - boolean_builder.append_null(); - continue; +macro_rules! array_has_dispatch_for_scalar { + ($haystack:expr, $offsets:expr, $needle:expr) => {{ + let values = $haystack.values(); + let is_nested = values.data_type().is_nested(); + // If first argument is empty list (second argument is non-null), return false + // i.e. array_has([], non-null element) -> false + if values.is_empty() { + return Ok(Arc::new(BooleanArray::new( + BooleanBuffer::new_unset($haystack.len()), + None, + ))); + } + let eq_array = compare_with_eq(values, $needle, is_nested)?; + let mut final_contained = vec![None; $haystack.len()]; + for (i, offset) in $offsets.windows(2).enumerate() { + let start = offset[0].to_usize().unwrap(); + let end = offset[1].to_usize().unwrap(); + let length = end - start; + // For non-nested list, length is 0 for null + if length == 0 { + continue; + } + let sliced_array = eq_array.slice(start, length); + final_contained[i] = Some(sliced_array.true_count() > 0); } - let arr = arr.unwrap(); - let is_nested = arr.data_type().is_nested(); - let needle_row = Scalar::new(needle.slice(i, 1)); - let eq_array = compare_with_eq(&arr, &needle_row, is_nested)?; - boolean_builder.append_value(eq_array.true_count() > 0); - } - Ok(Arc::new(boolean_builder.finish())) + Ok(Arc::new(BooleanArray::from(final_contained))) + }}; +} + +fn array_has_dispatch_for_fixed_size_list_scalar( + haystack: &ArrayRef, + needle: &dyn Datum, +) -> Result<ArrayRef> { + let haystack = as_fixed_size_list_array(haystack); + let value_length = haystack.value_length() as usize; + let offsets = (0_i32..haystack.len() as i32) + .step_by(value_length) + .collect::<Vec<_>>(); + array_has_dispatch_for_scalar!(haystack, offsets, needle) } fn array_has_dispatch_for_scalar<O: OffsetSizeTrait>( haystack: &ArrayRef, needle: &dyn Datum, ) -> Result<ArrayRef> { let haystack = as_generic_list_array::<O>(haystack)?; - let values = haystack.values(); - let is_nested = values.data_type().is_nested(); let offsets = haystack.value_offsets(); - // If first argument is empty list (second argument is non-null), return false - // i.e. array_has([], non-null element) -> false - if values.is_empty() { - return Ok(Arc::new(BooleanArray::new( - BooleanBuffer::new_unset(haystack.len()), - None, - ))); - } - let eq_array = compare_with_eq(values, needle, is_nested)?; - let mut final_contained = vec![None; haystack.len()]; - for (i, offset) in offsets.windows(2).enumerate() { - let start = offset[0].to_usize().unwrap(); - let end = offset[1].to_usize().unwrap(); - let length = end - start; - // For non-nested list, length is 0 for null - if length == 0 { - continue; + array_has_dispatch_for_scalar!(haystack, offsets, needle) +} + +fn array_has_all_inner(args: &[ArrayRef]) -> Result<ArrayRef> { + array_has_all_and_any_inner(args, ComparisonType::All) +} + +// General row comparison for array_has_all and array_has_any +macro_rules! general_array_has_for_all_and_any { + ($haystack:expr, $needle:expr, $comparison_type:expr) => {{ + let mut boolean_builder = BooleanArray::builder($haystack.len()); + let converter = RowConverter::new(vec![SortField::new($haystack.value_type())])?; + + for (arr, sub_arr) in $haystack.iter().zip($needle.iter()) { + if let (Some(arr), Some(sub_arr)) = (arr, sub_arr) { + let arr_values = converter.convert_columns(&[arr])?; + let sub_arr_values = converter.convert_columns(&[sub_arr])?; + boolean_builder.append_value(general_array_has_all_and_any_kernel( + arr_values, + sub_arr_values, + $comparison_type, + )); + } else { + boolean_builder.append_null(); + } } - let sliced_array = eq_array.slice(start, length); - final_contained[i] = Some(sliced_array.true_count() > 0); - } - Ok(Arc::new(BooleanArray::from(final_contained))) + Ok(Arc::new(boolean_builder.finish())) + }}; } -fn array_has_all_inner(args: &[ArrayRef]) -> Result<ArrayRef> { - match args[0].data_type() { - DataType::List(_) => { - array_has_all_and_any_dispatch::<i32>(&args[0], &args[1], ComparisonType::All) +// String comparison for array_has_all and array_has_any +macro_rules! array_has_all_and_any_string_internal { + ($haystack:expr, $needle:expr, $comparison_type:expr) => {{ + let mut boolean_builder = BooleanArray::builder($haystack.len()); + for (arr, sub_arr) in $haystack.iter().zip($needle.iter()) { + match (arr, sub_arr) { + (Some(arr), Some(sub_arr)) => { + let haystack_array = string_array_to_vec(&arr); + let needle_array = string_array_to_vec(&sub_arr); + boolean_builder.append_value(array_has_string_kernel( + haystack_array, + needle_array, + $comparison_type, + )); + } + (_, _) => { + boolean_builder.append_null(); + } + } } - DataType::LargeList(_) => { - array_has_all_and_any_dispatch::<i64>(&args[0], &args[1], ComparisonType::All) + + Ok(Arc::new(boolean_builder.finish())) + }}; +} + +macro_rules! array_has_all_and_any_dispatch { + ($haystack:expr, $needle:expr, $comparison_type:expr) => { + if $needle.values().is_empty() { + let buffer = match $comparison_type { + ComparisonType::All => BooleanBuffer::new_set($haystack.len()), + ComparisonType::Any => BooleanBuffer::new_unset($haystack.len()), + }; + Ok(Arc::new(BooleanArray::from(buffer))) + } else { + match $needle.data_type() { + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => { + array_has_all_and_any_string_internal!( + $haystack, + $needle, + $comparison_type + ) + } + _ => general_array_has_for_all_and_any!( + $haystack, + $needle, + $comparison_type + ), + } } - _ => exec_err!( - "array_has does not support type '{:?}'.", - args[0].data_type() - ), - } + }; } -fn array_has_any_inner(args: &[ArrayRef]) -> Result<ArrayRef> { - match args[0].data_type() { - DataType::List(_) => { - array_has_all_and_any_dispatch::<i32>(&args[0], &args[1], ComparisonType::Any) +fn array_has_all_and_any_inner( + args: &[ArrayRef], + comparison_type: ComparisonType, +) -> Result<ArrayRef> { + match (args[0].data_type(), args[1].data_type()) { Review Comment: This will generate quite a few different copies of the code (code bloat) -- and I am not sure how important it is to create a specialized version of each type of comparison Would it be possible instead to update the coercion logic so the argments are always cast to the same type (it might already be this way, I didn't double check) before `array_has_all` is called? ########## datafusion/functions-nested/src/array_has.rs: ########## @@ -232,98 +236,244 @@ fn array_has_inner_for_array(haystack: &ArrayRef, needle: &ArrayRef) -> Result<A match haystack.data_type() { DataType::List(_) => array_has_dispatch_for_array::<i32>(haystack, needle), DataType::LargeList(_) => array_has_dispatch_for_array::<i64>(haystack, needle), + DataType::FixedSizeList(_, _) => { + array_has_dispatch_for_fixed_size_list_array(haystack, needle) + } _ => exec_err!( "array_has does not support type '{:?}'.", haystack.data_type() ), } } +macro_rules! array_has_dispatch_for_array { Review Comment: I find macros a bit harder to deal with than normal functions Can you please add some documentation to these macros explaining what they do (and in particular, what the expectations on `$haystack` and `$needle` are)? Specifically, they look like they are supposed to be an array and scalar perhaps? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org