kosiew commented on code in PR #1041: URL: https://github.com/apache/datafusion-python/pull/1041#discussion_r1978864670
########## src/dataframe.rs: ########## @@ -90,59 +90,108 @@ impl PyDataFrame { } fn __repr__(&self, py: Python) -> PyDataFusionResult<String> { - let df = self.df.as_ref().clone().limit(0, Some(10))?; + // Get 11 rows to check if there are more than 10 + let df = self.df.as_ref().clone().limit(0, Some(11))?; let batches = wait_for_future(py, df.collect())?; - let batches_as_string = pretty::pretty_format_batches(&batches); + let num_rows = batches.iter().map(|batch| batch.num_rows()).sum::<usize>(); + + // Flatten batches into a single batch for the first 10 rows + let mut all_rows = Vec::new(); + let mut total_rows = 0; + + for batch in &batches { + let num_rows_to_take = if total_rows + batch.num_rows() > 10 { + 10 - total_rows + } else { + batch.num_rows() + }; + + if num_rows_to_take > 0 { + let sliced_batch = batch.slice(0, num_rows_to_take); + all_rows.push(sliced_batch); + total_rows += num_rows_to_take; + } + + if total_rows >= 10 { + break; + } + } + + let batches_as_string = pretty::pretty_format_batches(&all_rows); + match batches_as_string { - Ok(batch) => Ok(format!("DataFrame()\n{batch}")), + Ok(batch) => { + if num_rows > 10 { + Ok(format!("DataFrame()\n{batch}\n... and additional rows")) + } else { + Ok(format!("DataFrame()\n{batch}")) + } + } Err(err) => Ok(format!("Error: {:?}", err.to_string())), } } + + fn _repr_html_(&self, py: Python) -> PyDataFusionResult<String> { let mut html_str = "<table border='1'>\n".to_string(); - - let df = self.df.as_ref().clone().limit(0, Some(10))?; + + // Limit to the first 11 rows + let df = self.df.as_ref().clone().limit(0, Some(11))?; let batches = wait_for_future(py, df.collect())?; - + + // If there are no rows, close the table and return if batches.is_empty() { html_str.push_str("</table>\n"); return Ok(html_str); } - + + // Get schema for headers let schema = batches[0].schema(); - + let mut header = Vec::new(); for field in schema.fields() { - header.push(format!("<th>{}</td>", field.name())); + header.push(format!("<th>{}</th>", field.name())); } let header_str = header.join(""); html_str.push_str(&format!("<tr>{}</tr>\n", header_str)); - - for batch in batches { + + // Flatten rows and format them as HTML + let mut total_rows = 0; + for batch in &batches { + total_rows += batch.num_rows(); let formatters = batch .columns() .iter() .map(|c| ArrayFormatter::try_new(c.as_ref(), &FormatOptions::default())) - .map(|c| { - c.map_err(|e| PyValueError::new_err(format!("Error: {:?}", e.to_string()))) - }) + .map(|c| c.map_err(|e| PyValueError::new_err(format!("Error: {:?}", e.to_string())))) .collect::<Result<Vec<_>, _>>()?; - - for row in 0..batch.num_rows() { + + let num_rows_to_render = if total_rows > 10 { 10 } else { batch.num_rows() }; + + for row in 0..num_rows_to_render { let mut cells = Vec::new(); for formatter in &formatters { cells.push(format!("<td>{}</td>", formatter.value(row))); } let row_str = cells.join(""); html_str.push_str(&format!("<tr>{}</tr>\n", row_str)); } - } + if total_rows >= 10 { + break; + } Review Comment: How about simplifying to: ```rust let rows_remaining = 10 - total_rows; let rows_in_batch = batch.num_rows().min(rows_remaining); for row in 0..rows_in_batch { html_str.push_str("<tr>"); for col in batch.columns() { let formatter = ArrayFormatter::try_new(col.as_ref(), &FormatOptions::default())?; html_str.push_str("<td>"); html_str.push_str(&formatter.value(row).to_string()); html_str.push_str("</td>"); } html_str.push_str("</tr>\n"); } total_rows += rows_in_batch; ``` Reasons: 1. More Accurate Row Limiting: Before: total_rows was updated before checking the row limit, which could result in processing extra rows unnecessarily. After: rows_remaining = 10 - total_rows ensures that we never exceed the row limit. 2. Avoids Redundant Vec<String> Allocation: Before: Each row was constructed using a Vec<String>, and format!() was used for each cell. After: Directly appends <td> elements to html_str, eliminating unnecessary heap allocations. 3. Simplified and More Efficient Row Processing: Before: Used .map() and .collect() to create a list of ArrayFormatters before processing rows. After: Retrieves and formats values inside the loop, reducing redundant processing. 4. Avoids Unnecessary break Condition: Before: Explicit if total_rows >= 10 { break; } was used to stop processing. After: The min(rows_remaining, batch.num_rows()) logic naturally prevents extra iterations -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org