alamb commented on code in PR #16345: URL: https://github.com/apache/datafusion/pull/16345#discussion_r2138161484
########## datafusion/substrait/src/logical_plan/consumer/utils.rs: ########## @@ -81,98 +81,167 @@ pub(super) fn next_struct_field_name( } } -pub(super) fn rename_field( +/// Traverse through the field, renaming the provided field itself and all its inner struct fields. +pub fn rename_field( field: &Field, dfs_names: &Vec<String>, unnamed_field_suffix: usize, // If Substrait doesn't provide a name, we'll use this "c{unnamed_field_suffix}" Review Comment: I recommend moving the comments explaining the arguments into the docstring (`///` style comments) so it appears in the public documentation now that this function is `pub` Like ```rust /// Traverse through the field, renaming the provided field itself and all its inner struct fields. /// /// # Arguments /// - The field being renamed /// - dfs_names: ... /// - `unnamed_field_suffix`: If Substrait doesn't provide a name, we'll use this "c{unnamed_field_suffix}" pub fn rename_field( field: &Field, ... ``` ########## datafusion/substrait/src/logical_plan/consumer/utils.rs: ########## @@ -81,98 +81,167 @@ pub(super) fn next_struct_field_name( } } -pub(super) fn rename_field( +/// Traverse through the field, renaming the provided field itself and all its inner struct fields. +pub fn rename_field( field: &Field, dfs_names: &Vec<String>, unnamed_field_suffix: usize, // If Substrait doesn't provide a name, we'll use this "c{unnamed_field_suffix}" name_idx: &mut usize, // Index into dfs_names - rename_self: bool, // Some fields (e.g. list items) don't have names in Substrait and this will be false to keep old name ) -> datafusion::common::Result<Field> { - let name = if rename_self { - next_struct_field_name(unnamed_field_suffix, dfs_names, name_idx)? - } else { - field.name().to_string() - }; - match field.data_type() { + let name = next_struct_field_name(unnamed_field_suffix, dfs_names, name_idx)?; + rename_fields_data_type(field.clone().with_name(name), dfs_names, name_idx) +} + +/// Rename the field's data type but not the field itself. +pub fn rename_fields_data_type( + field: Field, + dfs_names: &Vec<String>, + name_idx: &mut usize, // Index into dfs_names +) -> datafusion::common::Result<Field> { + let dt = rename_data_type(field.data_type(), dfs_names, name_idx)?; + Ok(field.with_data_type(dt)) +} + +/// Traverse through the data type (incl. lists/maps/etc), renaming all inner struct fields. +pub fn rename_data_type( + data_type: &DataType, + dfs_names: &Vec<String>, + name_idx: &mut usize, // Index into dfs_names +) -> datafusion::common::Result<DataType> { + match data_type { Review Comment: This pattern of visiting / rewriting DataTypes seems like it might be cooler to break out into its own method / function. Not in this PR, but I wonder how many other walks over the DataTypes there are / could use the same structuer ########## datafusion/substrait/src/logical_plan/consumer/utils.rs: ########## @@ -81,98 +81,167 @@ pub(super) fn next_struct_field_name( } } -pub(super) fn rename_field( +/// Traverse through the field, renaming the provided field itself and all its inner struct fields. +pub fn rename_field( field: &Field, dfs_names: &Vec<String>, unnamed_field_suffix: usize, // If Substrait doesn't provide a name, we'll use this "c{unnamed_field_suffix}" name_idx: &mut usize, // Index into dfs_names - rename_self: bool, // Some fields (e.g. list items) don't have names in Substrait and this will be false to keep old name ) -> datafusion::common::Result<Field> { - let name = if rename_self { - next_struct_field_name(unnamed_field_suffix, dfs_names, name_idx)? - } else { - field.name().to_string() - }; - match field.data_type() { + let name = next_struct_field_name(unnamed_field_suffix, dfs_names, name_idx)?; + rename_fields_data_type(field.clone().with_name(name), dfs_names, name_idx) +} + +/// Rename the field's data type but not the field itself. +pub fn rename_fields_data_type( + field: Field, + dfs_names: &Vec<String>, + name_idx: &mut usize, // Index into dfs_names +) -> datafusion::common::Result<Field> { + let dt = rename_data_type(field.data_type(), dfs_names, name_idx)?; + Ok(field.with_data_type(dt)) +} + +/// Traverse through the data type (incl. lists/maps/etc), renaming all inner struct fields. +pub fn rename_data_type( + data_type: &DataType, + dfs_names: &Vec<String>, + name_idx: &mut usize, // Index into dfs_names +) -> datafusion::common::Result<DataType> { + match data_type { DataType::Struct(children) => { let children = children .iter() .enumerate() - .map(|(child_idx, f)| { - rename_field( - f.as_ref(), - dfs_names, - child_idx, - name_idx, - /*rename_self=*/ true, - ) + .map(|(field_idx, f)| { + rename_field(f.as_ref(), dfs_names, field_idx, name_idx) }) .collect::<datafusion::common::Result<_>>()?; - Ok(field - .to_owned() - .with_name(name) - .with_data_type(DataType::Struct(children))) + Ok(DataType::Struct(children)) } - DataType::List(inner) => { - let renamed_inner = rename_field( - inner.as_ref(), + DataType::List(inner) => Ok(DataType::List(Arc::new(rename_fields_data_type( + inner.as_ref().to_owned(), + dfs_names, + name_idx, + )?))), + DataType::LargeList(inner) => Ok(DataType::LargeList(Arc::new( + rename_fields_data_type(inner.as_ref().to_owned(), dfs_names, name_idx)?, + ))), + DataType::ListView(inner) => Ok(DataType::ListView(Arc::new( + rename_fields_data_type(inner.as_ref().to_owned(), dfs_names, name_idx)?, + ))), + DataType::LargeListView(inner) => Ok(DataType::LargeListView(Arc::new( + rename_fields_data_type(inner.as_ref().to_owned(), dfs_names, name_idx)?, + ))), + DataType::FixedSizeList(inner, len) => Ok(DataType::FixedSizeList( + Arc::new(rename_fields_data_type( + inner.as_ref().to_owned(), dfs_names, - 0, name_idx, - /*rename_self=*/ false, - )?; - Ok(field - .to_owned() - .with_data_type(DataType::List(FieldRef::new(renamed_inner))) - .with_name(name)) + )?), + *len, + )), + DataType::Map(entries, sorted) => { + let entries_data_type = match entries.data_type() { + DataType::Struct(fields) => { + // This should be two fields, normally "key" and "value", but not guaranteed + let fields = fields + .iter() + .map(|f| { + rename_fields_data_type( + f.as_ref().to_owned(), + dfs_names, + name_idx, + ) + }) + .collect::<datafusion::common::Result<_>>()?; + Ok(DataType::Struct(fields)) + } + _ => exec_err!("Expected map type to contain an inner struct type"), + }?; + Ok(DataType::Map( + Arc::new( + entries + .as_ref() + .to_owned() + .with_data_type(entries_data_type), + ), + *sorted, + )) } - DataType::LargeList(inner) => { - let renamed_inner = rename_field( - inner.as_ref(), + DataType::Dictionary(key_type, value_type) => { + // Dicts probably shouldn't contain structs, but support them just in case one does + Ok(DataType::Dictionary( + Box::new(rename_data_type(key_type, dfs_names, name_idx)?), + Box::new(rename_data_type(value_type, dfs_names, name_idx)?), + )) + } + DataType::RunEndEncoded(run_ends_field, values_field) => { + // At least the run_ends_field shouldn't contain names (since it should be i16/i32/i64), + // but we'll try renaming its datatype just in case. + let run_ends_field = rename_fields_data_type( + run_ends_field.as_ref().clone(), + dfs_names, + name_idx, + )?; + let values_field = rename_fields_data_type( + values_field.as_ref().clone(), dfs_names, - 0, name_idx, - /*rename_self= */ false, )?; - Ok(field - .to_owned() - .with_data_type(DataType::LargeList(FieldRef::new(renamed_inner))) - .with_name(name)) + + Ok(DataType::RunEndEncoded( + Arc::new(run_ends_field), + Arc::new(values_field), + )) } - DataType::Map(inner, sorted) => match inner.data_type() { - DataType::Struct(key_and_value) if key_and_value.len() == 2 => { - let renamed_keys = rename_field( - key_and_value[0].as_ref(), - dfs_names, - 0, - name_idx, - /*rename_self=*/ false, - )?; - let renamed_values = rename_field( - key_and_value[1].as_ref(), - dfs_names, - 0, - name_idx, - /*rename_self=*/ false, - )?; - Ok(field - .to_owned() - .with_data_type(DataType::Map( - Arc::new(Field::new( - inner.name(), - DataType::Struct(Fields::from(vec![ - renamed_keys, - renamed_values, - ])), - inner.is_nullable(), - )), - *sorted, + DataType::Union(fields, mode) => { + let fields = fields + .iter() + .map(|(i, f)| { + Ok(( + i, + Arc::new(rename_fields_data_type( + f.as_ref().clone(), + dfs_names, + name_idx, + )?), )) - .with_name(name)) - } - _ => substrait_err!("Map fields must contain a Struct with exactly 2 fields"), - }, - _ => Ok(field.to_owned().with_name(name)), + }) + .collect::<datafusion::common::Result<UnionFields>>()?; + Ok(DataType::Union(fields, *mode)) + } + // Explicitly listing the rest (which can not contain inner fields needing renaming) Review Comment: 👍 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org