kosiew commented on code in PR #1036: URL: https://github.com/apache/datafusion-python/pull/1036#discussion_r2002243419
########## python/tests/test_dataframe.py: ########## @@ -1191,13 +1192,17 @@ def add_with_parameter(df_internal, value: Any) -> DataFrame: def test_dataframe_repr_html(df) -> None: output = df._repr_html_() Review Comment: Would it be a good idea to test for other df fixtures too? not specific to this PR, maybe add an empty_df fixture for tests too eg ``` @pytest.fixture def empty_df(): ctx = SessionContext() # Create an empty RecordBatch with the same schema as df batch = pa.RecordBatch.from_arrays( [ pa.array([], type=pa.int64()), pa.array([], type=pa.int64()), pa.array([], type=pa.int64()), ], names=["a", "b", "c"], ) return ctx.from_arrow(batch) @pytest.mark.parametrize( "dataframe_fixture", ["empty_df", "df", "nested_df", "struct_df", "partitioned_df", "aggregate_df"], ) def test_dataframe_repr_html(request, dataframe_fixture) -> None: df = request.getfixturevalue(dataframe_fixture) output = df._repr_html_() ``` ########## src/dataframe.rs: ########## @@ -111,56 +116,151 @@ impl PyDataFrame { } fn __repr__(&self, py: Python) -> PyDataFusionResult<String> { - let df = self.df.as_ref().clone().limit(0, Some(10))?; - let batches = wait_for_future(py, df.collect())?; - let batches_as_string = pretty::pretty_format_batches(&batches); - match batches_as_string { - Ok(batch) => Ok(format!("DataFrame()\n{batch}")), - Err(err) => Ok(format!("Error: {:?}", err.to_string())), + let (batches, has_more) = wait_for_future( + py, + collect_record_batches_to_display(self.df.as_ref().clone(), 10, 10), + )?; + if batches.is_empty() { + // This should not be reached, but do it for safety since we index into the vector below + return Ok("No data to display".to_string()); } - } - fn _repr_html_(&self, py: Python) -> PyDataFusionResult<String> { - let mut html_str = "<table border='1'>\n".to_string(); + let batches_as_displ = + pretty::pretty_format_batches(&batches).map_err(py_datafusion_err)?; + + let additional_str = match has_more { + true => "\nData truncated.", + false => "", + }; - let df = self.df.as_ref().clone().limit(0, Some(10))?; - let batches = wait_for_future(py, df.collect())?; + Ok(format!("DataFrame()\n{batches_as_displ}{additional_str}")) + } + fn _repr_html_(&self, py: Python) -> PyDataFusionResult<String> { + let (batches, has_more) = wait_for_future( + py, + collect_record_batches_to_display( + self.df.as_ref().clone(), + MIN_TABLE_ROWS_TO_DISPLAY, + usize::MAX, + ), + )?; Review Comment: Extracting some variables into helper functions could make this more readable and easier to maintain eg: ``` fn _repr_html_(&self, py: Python) -> PyDataFusionResult<String> { let (batches, has_more) = wait_for_future( py, collect_record_batches_to_display( self.df.as_ref().clone(), MIN_TABLE_ROWS_TO_DISPLAY, usize::MAX, ), )?; if batches.is_empty() { // This should not be reached, but do it for safety since we index into the vector below return Ok("No data to display".to_string()); } let table_uuid = uuid::Uuid::new_v4().to_string(); let schema = batches[0].schema(); // Get table formatters for displaying cell values let batch_formatters = get_batch_formatters(&batches)?; let rows_per_batch = batches.iter().map(|batch| batch.num_rows()); // Generate HTML components let mut html_str = generate_html_table_header(&schema); html_str.push_str(&generate_table_rows( &batch_formatters, rows_per_batch, &table_uuid )?); html_str.push_str("</tbody></table></div>\n"); html_str.push_str(&generate_javascript()); if has_more { html_str.push_str("Data truncated due to size."); } Ok(html_str) } ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org