This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git
The following commit(s) were added to refs/heads/main by this push:
new ed5ce7b chore(ci): Separate Rust CI tasks into concurrent jobs (#9)
ed5ce7b is described below
commit ed5ce7b591637797d86585368abce9ced1e9ba30
Author: Dewey Dunnington <[email protected]>
AuthorDate: Sat Aug 30 06:20:51 2025 +0000
chore(ci): Separate Rust CI tasks into concurrent jobs (#9)
* stick with the matrix for now
* restrict paths
* make benchmarks tinier in test mode
---
.github/workflows/rust.yml | 47 +++++------
rust/sedona-testing/src/benchmark_util.rs | 129 ++++++++++++++++++++++--------
2 files changed, 118 insertions(+), 58 deletions(-)
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml
index dd86ff1..7069839 100644
--- a/.github/workflows/rust.yml
+++ b/.github/workflows/rust.yml
@@ -20,6 +20,12 @@ on:
pull_request:
branches:
- main
+ paths:
+ - 'Cargo.toml'
+ - 'Cargo.lock'
+ - '.github/workflows/rust.yml'
+ - 'rust/**'
+ - 'c/**'
push:
branches:
- main
@@ -39,9 +45,10 @@ jobs:
rust:
strategy:
matrix:
- os: [ubuntu-latest]
- name: "rust ${{ matrix.os }}"
- runs-on: ${{ matrix.os }}
+ name: ["clippy", "docs", "test"]
+
+ name: "${{ matrix.name }}"
+ runs-on: ubuntu-latest
env:
CARGO_INCREMENTAL: 0
steps:
@@ -89,42 +96,30 @@ jobs:
rustup default stable
- uses: Swatinem/rust-cache@v2
with:
-    # Update this key to force a new cache. When doing large dependency changes
-    # (e.g., updating DataFusion), the old cache is not very useful and may result
- # in out-of-disk ("no space left on device")
- prefix-key: "v4-df49"
- cache-on-failure: false
- cache-all-crates: false
+ # Update this key to force a new cache
+ prefix-key: "rust-${{ matrix.name }}-v1"
+
- name: Install dependencies
shell: bash
run: sudo apt-get update && sudo apt-get install -y libgeos-dev
- - name: Check formatting
- run: |
- cargo fmt --all -- --check
+
- name: Clippy
+ if: matrix.name == 'clippy'
run: |
- pushd rust
cargo clippy --workspace --all-targets --all-features -- -Dwarnings
- popd
+
- name: Test
+ if: matrix.name == 'test'
run: |
- pushd rust
cargo test --workspace --all-targets --all-features
- popd
+
- name: Doctests
+ if: matrix.name == 'test'
run: |
- pushd rust
cargo test --workspace --doc --all-features
- popd
- - name: Clean target directory before docs
- run: |
- pushd rust
- # Clean up build artifacts to free space before generating docs
- cargo clean
- popd
+
- name: Check docs
+ if: matrix.name == 'docs'
run: |
- pushd rust
# Generate docs with reduced parallelism to avoid memory issues
cargo doc --workspace --all-features -j 2
- popd
diff --git a/rust/sedona-testing/src/benchmark_util.rs b/rust/sedona-testing/src/benchmark_util.rs
index 7c15de3..ad4cff3 100644
--- a/rust/sedona-testing/src/benchmark_util.rs
+++ b/rust/sedona-testing/src/benchmark_util.rs
@@ -36,12 +36,20 @@ use crate::{
/// The default number of rows per batch (the same as the DataFusion default)
pub const ROWS_PER_BATCH: usize = 8192;
+/// The number of rows per batch to use for tiny size benchmarks
+pub const ROWS_PER_BATCH_TINY: usize = 1024;
+
/// The default number of batches to use for small size benchmarks
///
/// This was chosen to ensure that most benchmarks run nicely with criterion
/// defaults (target 5s, 100 samples).
pub const NUM_BATCHES_SMALL: usize = 16;
+/// The default number of batches to use for tiny size benchmarks
+///
+/// Just one batch for testing that benchmarks actually run.
+pub const NUM_BATCHES_TINY: usize = 1;
+
#[cfg(feature = "criterion")]
pub mod benchmark {
use super::*;
@@ -68,7 +76,13 @@ pub mod benchmark {
.expect(¬_found_err)
.clone()
.into();
- let data = config.into().build_data(NUM_BATCHES_SMALL).unwrap();
+ let data = config
+ .into()
+ .build_data(
+ Config::default().num_batches(),
+ Config::default().rows_per_batch(),
+ )
+ .unwrap();
c.bench_function(&data.make_label(lib, name), |b| {
b.iter(|| data.invoke_scalar(&udf).unwrap())
});
@@ -95,11 +109,48 @@ pub mod benchmark {
.expect(¬_found_err)
.clone()
.into();
- let data = config.into().build_data(NUM_BATCHES_SMALL).unwrap();
+ let data = config
+ .into()
+ .build_data(
+ Config::default().num_batches(),
+ Config::default().rows_per_batch(),
+ )
+ .unwrap();
c.bench_function(&data.make_label(lib, name), |b| {
b.iter(|| data.invoke_aggregate(&udf).unwrap())
});
}
+
+ pub enum Config {
+ Tiny,
+ Small,
+ }
+
+ impl Default for Config {
+ fn default() -> Self {
+ #[cfg(debug_assertions)]
+ return Self::Tiny;
+
+ #[cfg(not(debug_assertions))]
+ return Self::Small;
+ }
+ }
+
+ impl Config {
+ fn num_batches(&self) -> usize {
+ match self {
+ Config::Tiny => NUM_BATCHES_TINY,
+ Config::Small => NUM_BATCHES_SMALL,
+ }
+ }
+
+ fn rows_per_batch(&self) -> usize {
+ match self {
+ Config::Tiny => ROWS_PER_BATCH_TINY,
+ Config::Small => ROWS_PER_BATCH,
+ }
+ }
+ }
}
/// Specification for benchmark arguments
@@ -135,7 +186,7 @@ impl BenchmarkArgs {
}
/// Build [BenchmarkData] with the specified number of batches
- pub fn build_data(&self, num_batches: usize) -> Result<BenchmarkData> {
+    pub fn build_data(&self, num_batches: usize, rows_per_batch: usize) -> Result<BenchmarkData> {
let array_configs = match self {
BenchmarkArgs::Array(_)
| BenchmarkArgs::ArrayArray(_, _)
@@ -161,7 +212,7 @@ impl BenchmarkArgs {
let arrays = array_configs
.iter()
.enumerate()
- .map(|(i, col)| col.build_arrays(i, num_batches))
+ .map(|(i, col)| col.build_arrays(i, num_batches, rows_per_batch))
.collect::<Result<Vec<_>>>()?;
let scalars = scalar_configs
@@ -251,7 +302,7 @@ impl BenchmarkArgSpec {
/// This currently builds the same non-null scalar for each unique value
/// of i (the argument number).
pub fn build_scalar(&self, i: usize) -> Result<ScalarValue> {
- let array = self.build_arrays(i, 1)?;
+ let array = self.build_arrays(i, 1, 1)?;
ScalarValue::try_from_array(&array[0], 0)
}
@@ -259,31 +310,44 @@ impl BenchmarkArgSpec {
///
    /// This currently builds the same column for each unique value of i (the argument
    /// number). The batch size is currently fixed to 8192 (the DataFusion default).
-    pub fn build_arrays(&self, i: usize, num_batches: usize) -> Result<Vec<ArrayRef>> {
+ pub fn build_arrays(
+ &self,
+ i: usize,
+ num_batches: usize,
+ rows_per_batch: usize,
+ ) -> Result<Vec<ArrayRef>> {
match self {
BenchmarkArgSpec::Point => {
- self.build_geometry(i, GeometryTypeId::Point, num_batches, 1)
- }
- BenchmarkArgSpec::LineString(vertex_count) => {
-                self.build_geometry(i, GeometryTypeId::LineString, num_batches, *vertex_count)
- }
- BenchmarkArgSpec::Polygon(vertex_count) => {
-                self.build_geometry(i, GeometryTypeId::Polygon, num_batches, *vertex_count)
+                self.build_geometry(i, GeometryTypeId::Point, num_batches, 1, rows_per_batch)
}
+ BenchmarkArgSpec::LineString(vertex_count) => self.build_geometry(
+ i,
+ GeometryTypeId::LineString,
+ num_batches,
+ *vertex_count,
+ rows_per_batch,
+ ),
+ BenchmarkArgSpec::Polygon(vertex_count) => self.build_geometry(
+ i,
+ GeometryTypeId::Polygon,
+ num_batches,
+ *vertex_count,
+ rows_per_batch,
+ ),
BenchmarkArgSpec::Float64(lo, hi) => {
let mut rng = self.rng(i);
let dist = Uniform::new(lo, hi);
(0..num_batches)
.map(|_| -> Result<ArrayRef> {
let float64_array: Float64Array =
-                        (0..ROWS_PER_BATCH).map(|_| rng.sample(dist)).collect();
+                        (0..rows_per_batch).map(|_| rng.sample(dist)).collect();
Ok(Arc::new(float64_array))
})
.collect()
}
BenchmarkArgSpec::Transformed(inner, t) => {
let inner_type = inner.sedona_type();
- let inner_arrays = inner.build_arrays(i, num_batches)?;
+                let inner_arrays = inner.build_arrays(i, num_batches, rows_per_batch)?;
let tester = ScalarUdfTester::new(t.clone(), vec![inner_type]);
inner_arrays
.into_iter()
@@ -294,7 +358,7 @@ impl BenchmarkArgSpec {
let string_array = (0..num_batches)
.map(|_| {
let array = arrow_array::StringArray::from_iter_values(
- std::iter::repeat_n(s, ROWS_PER_BATCH),
+ std::iter::repeat_n(s, rows_per_batch),
);
Ok(Arc::new(array) as ArrayRef)
})
@@ -310,10 +374,11 @@ impl BenchmarkArgSpec {
geom_type: GeometryTypeId,
num_batches: usize,
vertex_count: usize,
+ rows_per_batch: usize,
) -> Result<Vec<ArrayRef>> {
let builder = RandomPartitionedDataBuilder::new()
.num_partitions(1)
- .rows_per_batch(ROWS_PER_BATCH)
+ .rows_per_batch(rows_per_batch)
.batches_per_partition(num_batches)
        // Use a random geometry range that is also not unrealistic for geography
.bounds(Rect::new((-10.0, -10.0), (10.0, 10.0)))
@@ -469,14 +534,14 @@ mod test {
let (spec, geometry_type, point_count) = config;
assert_eq!(spec.sedona_type(), WKB_GEOMETRY);
- let arrays = spec.build_arrays(0, 2).unwrap();
+ let arrays = spec.build_arrays(0, 2, ROWS_PER_BATCH).unwrap();
assert_eq!(arrays.len(), 2);
// Make sure this is deterministic
- assert_eq!(spec.build_arrays(0, 2).unwrap(), arrays);
+ assert_eq!(spec.build_arrays(0, 2, ROWS_PER_BATCH).unwrap(), arrays);
    // Make sure we generate different arrays for different argument numbers
- assert_ne!(spec.build_arrays(1, 2).unwrap(), arrays);
+ assert_ne!(spec.build_arrays(1, 2, ROWS_PER_BATCH).unwrap(), arrays);
for array in arrays {
assert_eq!(
@@ -506,14 +571,14 @@ mod test {
let spec = BenchmarkArgSpec::Float64(1.0, 2.0);
assert_eq!(spec.sedona_type(), DataType::Float64.try_into().unwrap());
- let arrays = spec.build_arrays(0, 2).unwrap();
+ let arrays = spec.build_arrays(0, 2, ROWS_PER_BATCH).unwrap();
assert_eq!(arrays.len(), 2);
// Make sure this is deterministic
- assert_eq!(spec.build_arrays(0, 2).unwrap(), arrays);
+ assert_eq!(spec.build_arrays(0, 2, ROWS_PER_BATCH).unwrap(), arrays);
    // Make sure we generate different arrays for different argument numbers
- assert_ne!(spec.build_arrays(1, 2).unwrap(), arrays);
+ assert_ne!(spec.build_arrays(1, 2, ROWS_PER_BATCH).unwrap(), arrays);
for array in arrays {
assert_eq!(array.data_type(), &DataType::Float64);
@@ -537,14 +602,14 @@ mod test {
assert_eq!(spec.sedona_type(), DataType::Float32.try_into().unwrap());
assert_eq!(format!("{spec:?}"), "float32(Float64(1.0, 2.0))");
- let arrays = spec.build_arrays(0, 2).unwrap();
+ let arrays = spec.build_arrays(0, 2, ROWS_PER_BATCH).unwrap();
assert_eq!(arrays.len(), 2);
// Make sure this is deterministic
- assert_eq!(spec.build_arrays(0, 2).unwrap(), arrays);
+ assert_eq!(spec.build_arrays(0, 2, ROWS_PER_BATCH).unwrap(), arrays);
    // Make sure we generate different arrays for different argument numbers
- assert_ne!(spec.build_arrays(1, 2).unwrap(), arrays);
+ assert_ne!(spec.build_arrays(1, 2, ROWS_PER_BATCH).unwrap(), arrays);
for array in arrays {
assert_eq!(array.data_type(), &DataType::Float32);
@@ -558,7 +623,7 @@ mod test {
let spec = BenchmarkArgs::Array(BenchmarkArgSpec::Point);
assert_eq!(spec.sedona_types(), [WKB_GEOMETRY]);
- let data = spec.build_data(2).unwrap();
+ let data = spec.build_data(2, ROWS_PER_BATCH).unwrap();
assert_eq!(data.num_batches, 2);
assert_eq!(data.arrays.len(), 1);
assert_eq!(data.scalars.len(), 0);
@@ -581,7 +646,7 @@ mod test {
[WKB_GEOMETRY, DataType::Float64.try_into().unwrap()]
);
- let data = spec.build_data(2).unwrap();
+ let data = spec.build_data(2, ROWS_PER_BATCH).unwrap();
assert_eq!(data.num_batches, 2);
assert_eq!(data.arrays.len(), 1);
@@ -606,7 +671,7 @@ mod test {
[WKB_GEOMETRY, DataType::Float64.try_into().unwrap()]
);
- let data = spec.build_data(2).unwrap();
+ let data = spec.build_data(2, ROWS_PER_BATCH).unwrap();
assert_eq!(data.num_batches, 2);
assert_eq!(data.scalars.len(), 1);
@@ -629,7 +694,7 @@ mod test {
[WKB_GEOMETRY, DataType::Float64.try_into().unwrap()]
);
- let data = spec.build_data(2).unwrap();
+ let data = spec.build_data(2, ROWS_PER_BATCH).unwrap();
assert_eq!(data.num_batches, 2);
assert_eq!(data.arrays.len(), 2);
assert_eq!(data.scalars.len(), 0);
@@ -660,7 +725,7 @@ mod test {
]
);
- let data = spec.build_data(2).unwrap();
+ let data = spec.build_data(2, ROWS_PER_BATCH).unwrap();
assert_eq!(data.num_batches, 2);
assert_eq!(data.arrays.len(), 1);
assert_eq!(data.scalars.len(), 2);
@@ -689,7 +754,7 @@ mod test {
]
);
- let data = spec.build_data(2).unwrap();
+ let data = spec.build_data(2, ROWS_PER_BATCH).unwrap();
assert_eq!(data.num_batches, 2);
assert_eq!(data.arrays.len(), 3);
assert_eq!(data.scalars.len(), 1);