westonpace commented on code in PR #16345: URL: https://github.com/apache/datafusion/pull/16345#discussion_r2138834078
########## datafusion/substrait/src/logical_plan/consumer/utils.rs: ########## @@ -81,98 +81,167 @@ pub(super) fn next_struct_field_name( } } -pub(super) fn rename_field( +/// Traverse through the field, renaming the provided field itself and all its inner struct fields. +pub fn rename_field( field: &Field, dfs_names: &Vec<String>, unnamed_field_suffix: usize, // If Substrait doesn't provide a name, we'll use this "c{unnamed_field_suffix}" name_idx: &mut usize, // Index into dfs_names - rename_self: bool, // Some fields (e.g. list items) don't have names in Substrait and this will be false to keep old name ) -> datafusion::common::Result<Field> { - let name = if rename_self { - next_struct_field_name(unnamed_field_suffix, dfs_names, name_idx)? - } else { - field.name().to_string() - }; - match field.data_type() { + let name = next_struct_field_name(unnamed_field_suffix, dfs_names, name_idx)?; + rename_fields_data_type(field.clone().with_name(name), dfs_names, name_idx) +} + +/// Rename the field's data type but not the field itself. +pub fn rename_fields_data_type( + field: Field, + dfs_names: &Vec<String>, + name_idx: &mut usize, // Index into dfs_names +) -> datafusion::common::Result<Field> { + let dt = rename_data_type(field.data_type(), dfs_names, name_idx)?; + Ok(field.with_data_type(dt)) +} + +/// Traverse through the data type (incl. lists/maps/etc), renaming all inner struct fields. +pub fn rename_data_type( + data_type: &DataType, + dfs_names: &Vec<String>, + name_idx: &mut usize, // Index into dfs_names +) -> datafusion::common::Result<DataType> { + match data_type { DataType::Struct(children) => { let children = children .iter() .enumerate() - .map(|(child_idx, f)| { - rename_field( - f.as_ref(), - dfs_names, - child_idx, - name_idx, - /*rename_self=*/ true, - ) + .map(|(field_idx, f)| { + rename_field(f.as_ref(), dfs_names, field_idx, name_idx) }) .collect::<datafusion::common::Result<_>>()?; - Ok(field - .to_owned() - .with_name(name) - .with_data_type(DataType::Struct(children))) + Ok(DataType::Struct(children)) } - DataType::List(inner) => { - let renamed_inner = rename_field( - inner.as_ref(), + DataType::List(inner) => Ok(DataType::List(Arc::new(rename_fields_data_type( + inner.as_ref().to_owned(), + dfs_names, + name_idx, + )?))), + DataType::LargeList(inner) => Ok(DataType::LargeList(Arc::new( + rename_fields_data_type(inner.as_ref().to_owned(), dfs_names, name_idx)?, + ))), + DataType::ListView(inner) => Ok(DataType::ListView(Arc::new( + rename_fields_data_type(inner.as_ref().to_owned(), dfs_names, name_idx)?, + ))), + DataType::LargeListView(inner) => Ok(DataType::LargeListView(Arc::new( + rename_fields_data_type(inner.as_ref().to_owned(), dfs_names, name_idx)?, + ))), + DataType::FixedSizeList(inner, len) => Ok(DataType::FixedSizeList( + Arc::new(rename_fields_data_type( + inner.as_ref().to_owned(), dfs_names, - 0, name_idx, - /*rename_self=*/ false, - )?; - Ok(field - .to_owned() - .with_data_type(DataType::List(FieldRef::new(renamed_inner))) - .with_name(name)) + )?), + *len, + )), + DataType::Map(entries, sorted) => { + let entries_data_type = match entries.data_type() { + DataType::Struct(fields) => { + // This should be two fields, normally "key" and "value", but not guaranteed + let fields = fields + .iter() + .map(|f| { + rename_fields_data_type( + f.as_ref().to_owned(), + dfs_names, + name_idx, + ) + }) + .collect::<datafusion::common::Result<_>>()?; + Ok(DataType::Struct(fields)) + } + _ => exec_err!("Expected map type to contain an inner struct type"), + }?; + Ok(DataType::Map( + Arc::new( + entries + .as_ref() + .to_owned() + .with_data_type(entries_data_type), + ), + *sorted, + )) } - DataType::LargeList(inner) => { - let renamed_inner = rename_field( - inner.as_ref(), + DataType::Dictionary(key_type, value_type) => { + // Dicts probably shouldn't contain structs, but support them just in case one does + Ok(DataType::Dictionary( + Box::new(rename_data_type(key_type, dfs_names, name_idx)?), + Box::new(rename_data_type(value_type, dfs_names, name_idx)?), + )) + } + DataType::RunEndEncoded(run_ends_field, values_field) => { Review Comment: Dictionary, REE, ListView, LargeList, LargeListView, are all "encodings" and not "types" as far as Substrait is concerned. There is some discussion of this in https://github.com/apache/arrow/blob/main/format/substrait/extension_types.yaml There's nothing wrong with what you have here, and I think it's probably fine to keep it, but if I'm being pedantic I don't think a producer should ever produce a plan with these types. Encodings can be expressed as variations however, in which case they will use the base type plus a variation identifier. So maybe it is still possible to hit this path if you are parsing one of those. ########## datafusion/substrait/src/logical_plan/consumer/utils.rs: ########## @@ -81,98 +81,167 @@ pub(super) fn next_struct_field_name( } } -pub(super) fn rename_field( +/// Traverse through the field, renaming the provided field itself and all its inner struct fields. +pub fn rename_field( field: &Field, dfs_names: &Vec<String>, unnamed_field_suffix: usize, // If Substrait doesn't provide a name, we'll use this "c{unnamed_field_suffix}" name_idx: &mut usize, // Index into dfs_names - rename_self: bool, // Some fields (e.g. list items) don't have names in Substrait and this will be false to keep old name ) -> datafusion::common::Result<Field> { - let name = if rename_self { - next_struct_field_name(unnamed_field_suffix, dfs_names, name_idx)? - } else { - field.name().to_string() - }; - match field.data_type() { + let name = next_struct_field_name(unnamed_field_suffix, dfs_names, name_idx)?; + rename_fields_data_type(field.clone().with_name(name), dfs_names, name_idx) +} + +/// Rename the field's data type but not the field itself. +pub fn rename_fields_data_type( + field: Field, + dfs_names: &Vec<String>, + name_idx: &mut usize, // Index into dfs_names +) -> datafusion::common::Result<Field> { + let dt = rename_data_type(field.data_type(), dfs_names, name_idx)?; + Ok(field.with_data_type(dt)) +} + +/// Traverse through the data type (incl. lists/maps/etc), renaming all inner struct fields. +pub fn rename_data_type( Review Comment: Do these two functions need to be `pub` (not a problem if yes, just curious)? ########## datafusion/substrait/src/logical_plan/consumer/utils.rs: ########## @@ -81,98 +81,167 @@ pub(super) fn next_struct_field_name( } } -pub(super) fn rename_field( +/// Traverse through the field, renaming the provided field itself and all its inner struct fields. +pub fn rename_field( field: &Field, dfs_names: &Vec<String>, unnamed_field_suffix: usize, // If Substrait doesn't provide a name, we'll use this "c{unnamed_field_suffix}" name_idx: &mut usize, // Index into dfs_names - rename_self: bool, // Some fields (e.g. list items) don't have names in Substrait and this will be false to keep old name ) -> datafusion::common::Result<Field> { - let name = if rename_self { - next_struct_field_name(unnamed_field_suffix, dfs_names, name_idx)? - } else { - field.name().to_string() - }; - match field.data_type() { + let name = next_struct_field_name(unnamed_field_suffix, dfs_names, name_idx)?; + rename_fields_data_type(field.clone().with_name(name), dfs_names, name_idx) +} + +/// Rename the field's data type but not the field itself. +pub fn rename_fields_data_type( + field: Field, + dfs_names: &Vec<String>, + name_idx: &mut usize, // Index into dfs_names +) -> datafusion::common::Result<Field> { + let dt = rename_data_type(field.data_type(), dfs_names, name_idx)?; + Ok(field.with_data_type(dt)) +} + +/// Traverse through the data type (incl. lists/maps/etc), renaming all inner struct fields. +pub fn rename_data_type( + data_type: &DataType, + dfs_names: &Vec<String>, + name_idx: &mut usize, // Index into dfs_names +) -> datafusion::common::Result<DataType> { + match data_type { DataType::Struct(children) => { let children = children .iter() .enumerate() - .map(|(child_idx, f)| { - rename_field( - f.as_ref(), - dfs_names, - child_idx, - name_idx, - /*rename_self=*/ true, - ) + .map(|(field_idx, f)| { + rename_field(f.as_ref(), dfs_names, field_idx, name_idx) }) .collect::<datafusion::common::Result<_>>()?; - Ok(field - .to_owned() - .with_name(name) - .with_data_type(DataType::Struct(children))) + Ok(DataType::Struct(children)) } - DataType::List(inner) => { - let renamed_inner = rename_field( - inner.as_ref(), + DataType::List(inner) => Ok(DataType::List(Arc::new(rename_fields_data_type( + inner.as_ref().to_owned(), + dfs_names, + name_idx, + )?))), + DataType::LargeList(inner) => Ok(DataType::LargeList(Arc::new( + rename_fields_data_type(inner.as_ref().to_owned(), dfs_names, name_idx)?, + ))), + DataType::ListView(inner) => Ok(DataType::ListView(Arc::new( + rename_fields_data_type(inner.as_ref().to_owned(), dfs_names, name_idx)?, + ))), + DataType::LargeListView(inner) => Ok(DataType::LargeListView(Arc::new( + rename_fields_data_type(inner.as_ref().to_owned(), dfs_names, name_idx)?, + ))), + DataType::FixedSizeList(inner, len) => Ok(DataType::FixedSizeList( + Arc::new(rename_fields_data_type( + inner.as_ref().to_owned(), dfs_names, - 0, name_idx, - /*rename_self=*/ false, - )?; - Ok(field - .to_owned() - .with_data_type(DataType::List(FieldRef::new(renamed_inner))) - .with_name(name)) + )?), + *len, + )), + DataType::Map(entries, sorted) => { + let entries_data_type = match entries.data_type() { + DataType::Struct(fields) => { + // This should be two fields, normally "key" and "value", but not guaranteed + let fields = fields Review Comment: Ah, I got a bit confused here but I think you're doing the right thing. You are _not_ pulling an item from `dfs_names` for the `key` or `value` entry which is correct. You will simply reuse whatever names those fields happen to have which is good. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org