Re: [PR] _repr_ and _html_repr_ show '... and additional rows' message [datafusion-python]

via GitHub Tue, 04 Mar 2025 00:22:40 -0800


kosiew commented on code in PR #1041:
URL: 
https://github.com/apache/datafusion-python/pull/1041#discussion_r1978864670



##########
src/dataframe.rs:
##########
@@ -90,59 +90,108 @@ impl PyDataFrame {
     }
 
     fn __repr__(&self, py: Python) -> PyDataFusionResult<String> {
-        let df = self.df.as_ref().clone().limit(0, Some(10))?;
+        // Get 11 rows to check if there are more than 10
+        let df = self.df.as_ref().clone().limit(0, Some(11))?;
         let batches = wait_for_future(py, df.collect())?;
-        let batches_as_string = pretty::pretty_format_batches(&batches);
+        let num_rows = batches.iter().map(|batch| 
batch.num_rows()).sum::<usize>();
+    
+        // Flatten batches into a single batch for the first 10 rows
+        let mut all_rows = Vec::new();
+        let mut total_rows = 0;
+        
+        for batch in &batches {
+            let num_rows_to_take = if total_rows + batch.num_rows() > 10 {
+                10 - total_rows
+            } else {
+                batch.num_rows()
+            };
+    
+            if num_rows_to_take > 0 {
+                let sliced_batch = batch.slice(0, num_rows_to_take);
+                all_rows.push(sliced_batch);
+                total_rows += num_rows_to_take;
+            }
+    
+            if total_rows >= 10 {
+                break;
+            }
+        }
+    
+        let batches_as_string = pretty::pretty_format_batches(&all_rows);
+    
         match batches_as_string {
-            Ok(batch) => Ok(format!("DataFrame()\n{batch}")),
+            Ok(batch) => {
+                if num_rows > 10 {
+                    Ok(format!("DataFrame()\n{batch}\n... and additional 
rows"))
+                } else {
+                    Ok(format!("DataFrame()\n{batch}"))
+                }
+            }
             Err(err) => Ok(format!("Error: {:?}", err.to_string())),
         }
     }
+    
+    
 
     fn _repr_html_(&self, py: Python) -> PyDataFusionResult<String> {
         let mut html_str = "<table border='1'>\n".to_string();
-
-        let df = self.df.as_ref().clone().limit(0, Some(10))?;
+    
+        // Limit to the first 11 rows
+        let df = self.df.as_ref().clone().limit(0, Some(11))?;
         let batches = wait_for_future(py, df.collect())?;
-
+    
+        // If there are no rows, close the table and return
         if batches.is_empty() {
             html_str.push_str("</table>\n");
             return Ok(html_str);
         }
-
+    
+        // Get schema for headers
         let schema = batches[0].schema();
-
+        
         let mut header = Vec::new();
         for field in schema.fields() {
-            header.push(format!("<th>{}</td>", field.name()));
+            header.push(format!("<th>{}</th>", field.name()));
         }
         let header_str = header.join("");
         html_str.push_str(&format!("<tr>{}</tr>\n", header_str));
-
-        for batch in batches {
+    
+        // Flatten rows and format them as HTML
+        let mut total_rows = 0;
+        for batch in &batches {
+            total_rows += batch.num_rows();
             let formatters = batch
                 .columns()
                 .iter()
                 .map(|c| ArrayFormatter::try_new(c.as_ref(), 
&FormatOptions::default()))
-                .map(|c| {
-                    c.map_err(|e| PyValueError::new_err(format!("Error: {:?}", 
e.to_string())))
-                })
+                .map(|c| c.map_err(|e| PyValueError::new_err(format!("Error: 
{:?}", e.to_string()))))
                 .collect::<Result<Vec<_>, _>>()?;
-
-            for row in 0..batch.num_rows() {
+    
+            let num_rows_to_render = if total_rows > 10 { 10 } else { 
batch.num_rows() };
+    
+            for row in 0..num_rows_to_render {
                 let mut cells = Vec::new();
                 for formatter in &formatters {
                     cells.push(format!("<td>{}</td>", formatter.value(row)));
                 }
                 let row_str = cells.join("");
                 html_str.push_str(&format!("<tr>{}</tr>\n", row_str));
             }
-        }
 
+            if total_rows >= 10 {
+                break;
+            }

Review Comment:
   How about simplifying to:
   ```rust
               let rows_remaining = 10 - total_rows;
               let rows_in_batch = batch.num_rows().min(rows_remaining);
   
               for row in 0..rows_in_batch {
                   html_str.push_str("<tr>");
                   for col in batch.columns() {
                       let formatter =
                           ArrayFormatter::try_new(col.as_ref(), 
&FormatOptions::default())?;
                       html_str.push_str("<td>");
                       html_str.push_str(&formatter.value(row).to_string());
                       html_str.push_str("</td>");
                   }
                   html_str.push_str("</tr>\n");
               }
   
               total_rows += rows_in_batch;
   ```
   Reasons:
   
   1. More Accurate Row Limiting:
   
   Before: total_rows was updated before checking the row limit, which could 
result in processing extra rows unnecessarily.
   After: rows_remaining = 10 - total_rows ensures that we never exceed the row 
limit.
   
   2. Avoids Redundant Vec<String> Allocation:
   
   Before: Each row was constructed using a Vec<String>, and format!() was used 
for each cell.
   After: Directly appends <td> elements to html_str, eliminating unnecessary 
heap allocations.
   
   3. Simplified and More Efficient Row Processing:
   
   Before:
   Used .map() and .collect() to create a list of ArrayFormatters before 
processing rows.
   After:
   Retrieves and formats values inside the loop, reducing redundant processing.
   
   4. Avoids Unnecessary break Condition:
   
   Before: Explicit if total_rows >= 10 { break; } was used to stop processing.
   After: The min(rows_remaining, batch.num_rows()) logic naturally prevents 
extra iterations



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Re: [PR] _repr_ and _html_repr_ show '... and additional rows' message [datafusion-python]

Reply via email to