comphead commented on code in PR #3290:
URL: https://github.com/apache/datafusion-comet/pull/3290#discussion_r2732947276
##########
native/core/benches/row_columnar.rs:
##########
@@ -15,90 +15,822 @@
// specific language governing permissions and limitations
// under the License.
-use arrow::datatypes::DataType as ArrowDataType;
+//! Benchmarks for JVM shuffle row-to-columnar conversion.
+//!
+//! This benchmark measures the performance of converting Spark UnsafeRow
+//! to Arrow arrays via `process_sorted_row_partition()`, which is called
+//! by JVM shuffle (CometColumnarShuffle) when writing shuffle data.
+//!
+//! Covers:
+//! - Primitive types (Int64)
+//! - Struct (flat, nested, deeply nested)
+//! - List
+//! - Map
+
+use arrow::datatypes::{DataType, Field, Fields};
use comet::execution::shuffle::row::{
process_sorted_row_partition, SparkUnsafeObject, SparkUnsafeRow,
};
use comet::execution::shuffle::CompressionCodec;
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+use std::sync::Arc;
use tempfile::Builder;
-const NUM_ROWS: usize = 10000;
const BATCH_SIZE: usize = 5000;
-const NUM_COLS: usize = 100;
-const ROW_SIZE: usize = SparkUnsafeRow::get_row_bitset_width(NUM_COLS) + NUM_COLS * 8;
-fn benchmark(c: &mut Criterion) {
- let mut group = c.benchmark_group("row_array_conversion");
+/// Create a struct schema with the given number of int64 fields.
+fn make_struct_schema(num_fields: usize) -> DataType {
+ let fields: Vec<Field> = (0..num_fields)
+ .map(|i| Field::new(format!("f{}", i), DataType::Int64, true))
+ .collect();
+ DataType::Struct(Fields::from(fields))
+}
- group.bench_function("row_to_array", |b| {
- let spark_rows = (0..NUM_ROWS)
- .map(|_| {
-            let mut spark_row = SparkUnsafeRow::new_with_num_fields(NUM_COLS);
- let mut row = Row::new();
+/// Calculate the row size for a struct with the given number of fields.
+/// UnsafeRow layout: [null bits] [fixed-length values]
+/// For struct: the struct value is stored as offset+size (8 bytes) pointing to nested row
+fn get_row_size(num_struct_fields: usize) -> usize {
+ // Top-level row has 1 column (the struct)
+ let top_level_bitset_width = SparkUnsafeRow::get_row_bitset_width(1);
+ // Struct pointer (offset + size) is 8 bytes
Review Comment:
```suggestion
    // Struct pointer (offset + size) is 8 bytes on 64-bit architectures
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]