alamb commented on issue #13748: URL: https://github.com/apache/datafusion/issues/13748#issuecomment-2540158393
BTW here is the code I am using to generate those numbers (it is pretty grotty) <details><summary>Details</summary> <p> ```rust // DataFusion spilling sort benchmark / exmaples // Idea is to replicate a report from https://discord.com/channels/885562378132000778/1166447479609376850/1276137008435298335 // where sort doesn't spill // Related link: sorting strings use std::fs::File; use std::path::PathBuf; use datafusion::arrow::array::{ArrayRef, Int32Array}; use datafusion::arrow::datatypes::{Field, Fields, Schema}; use datafusion::arrow::record_batch::RecordBatch; use datafusion::error::Result; use std::sync::Arc; use datafusion::arrow; use datafusion::prelude::SessionContext; #[tokio::main] async fn main() -> Result<()> { // initialize logging to see DataFusion's internal logging std::env::set_var("RUST_LOG", "info"); env_logger::init(); let ctx = SessionContext::new(); let n = 100; make_table(n); ctx.sql(&format!("CREATE EXTERNAL TABLE t({}) STORED AS CSV LOCATION 'data.csv' WITH ORDER (c1)", column_list(100))).await? .show().await?; println!("10 columns"); println!("{}", make_query(10)); println!("40 columns"); println!("{}", make_query(40)); for n in [10, 20, 30, 40, 50, 60, 70, 80, 90, 100] { let sql = make_query(n); print!("Running with {n} columns"); let start = std::time::Instant::now(); ctx.sql(&sql).await?.collect().await?; let elapsed = start.elapsed(); println!("...completed in {:?}", elapsed); } Ok(()) } fn column_list(n: usize) -> String { (0..n) .map(|i| format!("c{} int", i)) .collect::<Vec<_>>() .join(", ") } /// Writes a table like this to CSV file /// c1: int, c2: int, c3: int....cn:int /// /// returns the filename fn make_table(n: usize) -> PathBuf { let path = PathBuf::from("data.csv"); let arrays = (0..n) .map(|i| { let i = i as i32; let n = n as i32; Arc::new(Int32Array::from(vec![i * n, 2 * i * n, 3 * i * n])) as ArrayRef }) .collect::<Vec<_>>(); let schema = Schema::new(Fields::from( arrays .iter() .enumerate() .map(|(i, arr)| Field::new(&format!("c{}", i), arr.data_type().clone(), false)) .collect::<Vec<_>>(), )); let batch = RecordBatch::try_new(Arc::new(schema), arrays).unwrap(); let file = File::create(&path).unwrap(); let mut writer = arrow::csv::Writer::new(file); writer.write(&batch).unwrap(); // flush on drop // writer.into_inner(); path } /// return a query like /// ```sql /// select c1, null as c2, ... null as cn from t ORDER BY c1 /// UNION ALL /// select null as c1, c2, ... null as cn from t ORDER BY c2 /// ... /// select null as c1, null as c2, ... cn from t ORDER BY cn /// ORDER BY c1, c2 ... CN /// ``` fn make_query(n: usize) -> String { let mut query = String::new(); for i in 0..n { if i != 0 { query.push_str("\n UNION ALL \n"); } let select_list = (0..n) .map(|j| { if i == j { format!("c{j}") } else { format!("null as c{j}") } }) .collect::<Vec<_>>() .join(", "); query.push_str(&format!("(SELECT {} FROM t ORDER BY c{})", select_list, i)); } query.push_str(&format!( "\nORDER BY {}", (0..n) .map(|i| format!("c{}", i)) .collect::<Vec<_>>() .join(", ") )); query } ``` </p> </details> -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
