comphead commented on code in PR #3290:
URL: https://github.com/apache/datafusion-comet/pull/3290#discussion_r2732953016


##########
native/core/benches/row_columnar.rs:
##########
@@ -15,90 +15,822 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::datatypes::DataType as ArrowDataType;
+//! Benchmarks for JVM shuffle row-to-columnar conversion.
+//!
+//! This benchmark measures the performance of converting Spark UnsafeRow
+//! to Arrow arrays via `process_sorted_row_partition()`, which is called
+//! by JVM shuffle (CometColumnarShuffle) when writing shuffle data.
+//!
+//! Covers:
+//! - Primitive types (Int64)
+//! - Struct (flat, nested, deeply nested)
+//! - List
+//! - Map
+
+use arrow::datatypes::{DataType, Field, Fields};
 use comet::execution::shuffle::row::{
     process_sorted_row_partition, SparkUnsafeObject, SparkUnsafeRow,
 };
 use comet::execution::shuffle::CompressionCodec;
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+use std::sync::Arc;
 use tempfile::Builder;
 
-const NUM_ROWS: usize = 10000;
 const BATCH_SIZE: usize = 5000;
-const NUM_COLS: usize = 100;
-const ROW_SIZE: usize = SparkUnsafeRow::get_row_bitset_width(NUM_COLS) + 
NUM_COLS * 8;
 
-fn benchmark(c: &mut Criterion) {
-    let mut group = c.benchmark_group("row_array_conversion");
+/// Create a struct schema with the given number of int64 fields.
+fn make_struct_schema(num_fields: usize) -> DataType {
+    let fields: Vec<Field> = (0..num_fields)
+        .map(|i| Field::new(format!("f{}", i), DataType::Int64, true))
+        .collect();
+    DataType::Struct(Fields::from(fields))
+}
 
-    group.bench_function("row_to_array", |b| {
-        let spark_rows = (0..NUM_ROWS)
-            .map(|_| {
-                let mut spark_row = 
SparkUnsafeRow::new_with_num_fields(NUM_COLS);
-                let mut row = Row::new();
+/// Calculate the row size for a struct with the given number of fields.
+/// UnsafeRow layout: [null bits] [fixed-length values]
+/// For struct: the struct value is stored as offset+size (8 bytes) pointing 
to nested row
+fn get_row_size(num_struct_fields: usize) -> usize {
+    // Top-level row has 1 column (the struct)
+    let top_level_bitset_width = SparkUnsafeRow::get_row_bitset_width(1);
+    // Struct pointer (offset + size) is 8 bytes
+    let struct_pointer_size = 8;
+    // Nested struct row
+    let nested_bitset_width = 
SparkUnsafeRow::get_row_bitset_width(num_struct_fields);
+    let nested_data_size = num_struct_fields * 8; // int64 values
 
-                for i in 
SparkUnsafeRow::get_row_bitset_width(NUM_COLS)..ROW_SIZE {
-                    row.data[i] = i as u8;
-                }
+    top_level_bitset_width + struct_pointer_size + nested_bitset_width + 
nested_data_size
+}
+
+struct RowData {
+    data: Vec<u8>,
+}
+
+impl RowData {
+    fn new(num_struct_fields: usize) -> Self {
+        let row_size = get_row_size(num_struct_fields);
+        let mut data = vec![0u8; row_size];
+
+        // Top-level row layout:
+        // [null bits for 1 field] [struct pointer (offset, size)]
+        let top_level_bitset_width = SparkUnsafeRow::get_row_bitset_width(1);
+
+        // Nested struct starts after top-level row header + pointer
+        let nested_offset = top_level_bitset_width + 8;

Review Comment:
   Just a thought: there are a lot of hard-coded eights here — it would probably be clearer to give them names. Which one is the pointer size, which is the int64 size, etc.?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to