Istvan Szukacs created ARROW-7866: ------------------------------------- Summary: [Rust] How to handle aggregates with Datafusion? Key: ARROW-7866 URL: https://issues.apache.org/jira/browse/ARROW-7866 Project: Apache Arrow Issue Type: Bug Components: Rust - DataFusion Affects Versions: 0.16.0 Reporter: Istvan Szukacs
Hi, I am trying to use the Rust tools and I ran into this interesting behavior. {code} use std::sync::Arc; extern crate arrow; extern crate datafusion; use arrow::array::{StringArray, UInt32Array}; use arrow::datatypes::{DataType, Field, Schema}; use datafusion::error::Result; use datafusion::execution::context::ExecutionContext; fn main() -> Result<()> { // create local execution context let mut ctx = ExecutionContext::new(); // define schema for data source (csv file) let schema = Arc::new(Schema::new(vec![ Field::new("index", DataType::UInt32, false), Field::new("tablename", DataType::Utf8, false), Field::new("datetime", DataType::Utf8, false), Field::new("partition", DataType::Utf8, false), Field::new("fileSize", DataType::UInt32, false), ])); // register csv file with the execution context ctx.register_csv( "awesome_table", &format!( "../tableMetaData/anon/{}", "013ff3245cc90a3394943da0b6552b9f98313288e0a174f94f8dcb10.csv" ), &schema, true, ); let sql = "SELECT COUNT(partition) FROM awesome_table"; // create the query plan let plan = ctx.create_logical_plan(&sql)?; let plan = ctx.optimize(&plan)?; let plan = ctx.create_physical_plan(&plan, 1024 * 1024)?; // execute the query let results = ctx.collect(plan.as_ref())?; // iterate over the results results.iter().for_each(|batch| { println!( "RecordBatch has {} rows and {} columns", batch.num_rows(), batch.num_columns() ); let retf1 = batch.column(0).as_any().downcast_ref::<StringArray>(); match retf1 { Some(_) => println!("Some retf1"), None => println!("None retf1"), } let retf2 = batch.column(1).as_any().downcast_ref::<StringArray>(); match retf2 { Some(_) => println!("Some retf2"), None => println!("None retf2"), } // for i in 0..batch.num_rows() { // println!("P: {}", pcnt.value(i)); // } }); Ok(()) } {code} {code} cargo build --release && ./target/release/cli RecordBatch has 1 rows and 1 columns RecordBatch has 1 rows and 2 columns None retf1 None retf2 {code} In case of no aggregation: {code} let 
sql = "SELECT index FROM awesome_table"; // create the query plan let plan = ctx.create_logical_plan(&sql)?; let plan = ctx.optimize(&plan)?; let plan = ctx.create_physical_plan(&plan, 1024 * 1024)?; // execute the query let results = ctx.collect(plan.as_ref())?; // iterate over the results results.iter().for_each(|batch| { println!( "RecordBatch has {} rows and {} columns", batch.num_rows(), batch.num_columns(), ); let retf1 = batch.column(0).as_any().downcast_ref::<UInt32Array>(); match retf1 { Some(_) => println!("Some retf1"), None => println!("None retf1"), } // let retf2 = batch.column(1).as_any().downcast_ref::<UInt32Array>(); // match retf2 { // Some(_) => println!("Some retf2"), // None => println!("None retf2"), // } for i in 0..batch.num_rows() { println!("P: {}", retf1.unwrap().value(i)); } }); Ok(()) } {code} {code} P: 126436 P: 126437 P: 126438 P: 126439 P: 126440 P: 126441 {code} Is there a way to access the fields when aggregating? -- This message was sent by Atlassian Jira (v8.3.4#803005)