This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new ddfc282008 Add tests for sqllogictest prioritization (#20656)
ddfc282008 is described below

commit ddfc282008587b15947ff54c91497638df1ca2d4
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Mar 4 17:42:27 2026 -0500

    Add tests for sqllogictest prioritization (#20656)
    
    ## Which issue does this PR close?
    
    - Follow on to https://github.com/apache/datafusion/pull/20576
    
    ## Rationale for this change
    
    Implement suggestions from @kosiew in
    https://github.com/apache/datafusion/pull/20576#discussion_r2870761276:
    > Can we add a deterministic tie-breaker in sort_tests (for equal
    priority) using relative_path, e.g. sort_by_key(|f| (priority,
    f.relative_path.clone())) to keep run order stable?
    >
    > This would also benefit from a small unit test covering:
    >
    > prioritized files are first,
    > non-prioritized files keep deterministic ordering
    
    ## What changes are included in this PR?
    1. Add deterministic tie breaker so tests are always run in the same
    order
    2. Add a test for the ordering
    
    Note that in order to actually write these tests I needed to pull the
    TestFile and some related logic into its own module
    
    ## Are these changes tested?
    
    Yes by CI
    
    I also ran the tests locally via
    ```shell
    cargo test  --profile=ci --test sqllogictests
    ```
    
    I also double checked sqllogictests
    ```shell
    INCLUDE_SQLITE=true cargo test --profile release-nonlto --test sqllogictests
    ```
    
    ## Are there any user-facing changes?
    
    <!--
    If there are user-facing changes then we may require documentation to be
    updated before approving the PR.
    -->
    
    <!--
    If there are any breaking changes to public APIs, please add the `api
    change` label.
    -->
---
 datafusion/sqllogictest/bin/sqllogictests.rs | 156 ++++------------------
 datafusion/sqllogictest/src/lib.rs           |   2 +
 datafusion/sqllogictest/src/test_file.rs     | 186 +++++++++++++++++++++++++++
 3 files changed, 213 insertions(+), 131 deletions(-)

diff --git a/datafusion/sqllogictest/bin/sqllogictests.rs 
b/datafusion/sqllogictest/bin/sqllogictests.rs
index b5a382ca6a..38a7635042 100644
--- a/datafusion/sqllogictest/bin/sqllogictests.rs
+++ b/datafusion/sqllogictest/bin/sqllogictests.rs
@@ -18,9 +18,8 @@
 use clap::{ColorChoice, Parser, ValueEnum};
 use datafusion::common::instant::Instant;
 use datafusion::common::utils::get_available_parallelism;
-use datafusion::common::{
-    DataFusionError, HashMap, Result, exec_datafusion_err, exec_err,
-};
+use datafusion::common::{DataFusionError, Result, exec_datafusion_err, 
exec_err};
+use datafusion_sqllogictest::TestFile;
 use datafusion_sqllogictest::{
     CurrentlyExecutingSqlTracker, DataFusion, DataFusionSubstraitRoundTrip, 
Filter,
     TestContext, df_value_validator, read_dir_recursive, setup_scratch_dir,
@@ -44,13 +43,12 @@ use crate::postgres_container::{
 };
 use datafusion::common::runtime::SpawnedTask;
 use futures::FutureExt;
-use std::ffi::OsStr;
 use std::fs;
 use std::io::{IsTerminal, stderr, stdout};
 use std::path::{Path, PathBuf};
 use std::str::FromStr;
+use std::sync::Arc;
 use std::sync::atomic::{AtomicUsize, Ordering};
-use std::sync::{Arc, LazyLock};
 use std::time::Duration;
 
 #[cfg(feature = "postgres")]
@@ -59,6 +57,7 @@ mod postgres_container;
 const TEST_DIRECTORY: &str = "test_files/";
 const DATAFUSION_TESTING_TEST_DIRECTORY: &str = 
"../../datafusion-testing/data/";
 const PG_COMPAT_FILE_PREFIX: &str = "pg_compat_";
+const TPCH_PREFIX: &str = "tpch";
 const SQLITE_PREFIX: &str = "sqlite";
 const ERRS_PER_FILE_LIMIT: usize = 10;
 const TIMING_DEBUG_SLOW_FILES_ENV: &str = "SLT_TIMING_DEBUG_SLOW_FILES";
@@ -77,55 +76,6 @@ struct FileTiming {
     elapsed: Duration,
 }
 
-/// TEST PRIORITY
-///
-/// Heuristically prioritize some test to run earlier.
-///
-/// Prioritizes test to run earlier if they are known to be long running (as
-/// each test file itself is run sequentially, but multiple test files are run
-/// in parallel.
-///
-/// Tests not listed here will run after the listed tests in an arbitrary 
order.
-///
-/// You can find the top longest running tests by running `--timing-summary` 
mode.
-/// For example
-///
-/// ```shell
-/// $  cargo test --profile=ci  --test sqllogictests -- --timing-summary top
-/// ...
-/// Per-file elapsed summary (deterministic):
-/// 1.    5.375s  push_down_filter_regression.slt
-/// 2.    3.174s  aggregate.slt
-/// 3.    3.158s  imdb.slt
-/// 4.    2.793s  joins.slt
-/// 5.    2.505s  array.slt
-/// 6.    2.265s  aggregate_skip_partial.slt
-/// 7.    2.260s  window.slt
-/// 8.    1.677s  group_by.slt
-/// 9.    0.973s  datetime/timestamps.slt
-/// 10.    0.822s  cte.slt
-/// ```
-static TEST_PRIORITY: LazyLock<HashMap<PathBuf, usize>> = LazyLock::new(|| {
-    [
-        (PathBuf::from("push_down_filter_regression.slt"), 0), // longest 
running, so run first.
-        (PathBuf::from("aggregate.slt"), 1),
-        (PathBuf::from("joins.slt"), 2),
-        (PathBuf::from("imdb.slt"), 3),
-        (PathBuf::from("array.slt"), 4),
-        (PathBuf::from("aggregate_skip_partial.slt"), 5),
-        (PathBuf::from("window.slt"), 6),
-        (PathBuf::from("group_by.slt"), 7),
-        (PathBuf::from("datetime/timestamps.slt"), 8),
-        (PathBuf::from("cte.slt"), 9),
-    ]
-    .into_iter()
-    .collect()
-});
-
-/// Default priority for tests not in the TEST_PRIORITY map. Tests with lower
-/// priority values run first.
-static DEFAULT_PRIORITY: usize = 100;
-
 pub fn main() -> Result<()> {
     tokio::runtime::Builder::new_multi_thread()
         .enable_all()
@@ -832,91 +782,35 @@ async fn run_complete_file_with_postgres(
     plan_err!("Can not run with postgres as postgres feature is not enabled")
 }
 
-/// Represents a parsed test file
-#[derive(Debug)]
-struct TestFile {
-    /// The absolute path to the file
-    pub path: PathBuf,
-    /// The relative path of the file (used for display)
-    pub relative_path: PathBuf,
-}
-
-impl TestFile {
-    fn new(path: PathBuf) -> Self {
-        let p = path.to_string_lossy();
-        let relative_path = PathBuf::from(if p.starts_with(TEST_DIRECTORY) {
-            p.strip_prefix(TEST_DIRECTORY).unwrap()
-        } else if p.starts_with(DATAFUSION_TESTING_TEST_DIRECTORY) {
-            p.strip_prefix(DATAFUSION_TESTING_TEST_DIRECTORY).unwrap()
-        } else {
-            ""
-        });
-
-        Self {
-            path,
-            relative_path,
-        }
-    }
-
-    fn is_slt_file(&self) -> bool {
-        self.path.extension() == Some(OsStr::new("slt"))
-    }
-
-    fn check_sqlite(&self, options: &Options) -> bool {
-        if !self.relative_path.starts_with(SQLITE_PREFIX) {
-            return true;
-        }
-
-        options.include_sqlite
-    }
-
-    fn check_tpch(&self, options: &Options) -> bool {
-        if !self.relative_path.starts_with("tpch") {
-            return true;
-        }
+fn read_test_files(options: &Options) -> Result<Vec<TestFile>> {
+    let prefixes: &[&str] = if options.include_sqlite {
+        &[TEST_DIRECTORY, DATAFUSION_TESTING_TEST_DIRECTORY]
+    } else {
+        &[TEST_DIRECTORY]
+    };
 
-        options.include_tpch
-    }
-}
+    let directories = prefixes
+        .iter()
+        .map(|prefix| {
+            read_dir_recursive(prefix).map_err(|e| {
+                exec_datafusion_err!("Error reading test directory {prefix}: 
{e}")
+            })
+        })
+        .collect::<Result<Vec<_>>>()?;
 
-fn read_test_files(options: &Options) -> Result<Vec<TestFile>> {
-    let mut paths = read_dir_recursive(TEST_DIRECTORY)?
+    let mut paths = directories
         .into_iter()
-        .map(TestFile::new)
+        .flatten()
+        .map(|p| TestFile::new(p, prefixes))
         .filter(|f| options.check_test_file(&f.path))
         .filter(|f| f.is_slt_file())
-        .filter(|f| f.check_tpch(options))
-        .filter(|f| f.check_sqlite(options))
+        .filter(|f| !f.relative_path_starts_with(TPCH_PREFIX) || 
options.include_tpch)
+        .filter(|f| !f.relative_path_starts_with(SQLITE_PREFIX) || 
options.include_sqlite)
         .filter(|f| options.check_pg_compat_file(f.path.as_path()))
         .collect::<Vec<_>>();
-    if options.include_sqlite {
-        let mut sqlite_paths = 
read_dir_recursive(DATAFUSION_TESTING_TEST_DIRECTORY)?
-            .into_iter()
-            .map(TestFile::new)
-            .filter(|f| options.check_test_file(&f.path))
-            .filter(|f| f.is_slt_file())
-            .filter(|f| f.check_sqlite(options))
-            .filter(|f| options.check_pg_compat_file(f.path.as_path()))
-            .collect::<Vec<_>>();
-
-        paths.append(&mut sqlite_paths)
-    }
 
-    Ok(sort_tests(paths))
-}
-
-/// Sort the tests heuristically by order of "priority"
-///
-/// Prioritizes test to run earlier if they are known to be long running (as
-/// each test file itself is run sequentially, but multiple test files are run
-/// in parallel.
-fn sort_tests(mut tests: Vec<TestFile>) -> Vec<TestFile> {
-    tests.sort_by_key(|f| {
-        TEST_PRIORITY
-            .get(&f.relative_path)
-            .unwrap_or(&DEFAULT_PRIORITY)
-    });
-    tests
+    paths.sort_unstable();
+    Ok(paths)
 }
 
 /// Parsed command line options
diff --git a/datafusion/sqllogictest/src/lib.rs 
b/datafusion/sqllogictest/src/lib.rs
index f228ee89ab..bb12c58bdc 100644
--- a/datafusion/sqllogictest/src/lib.rs
+++ b/datafusion/sqllogictest/src/lib.rs
@@ -27,6 +27,7 @@
 //! DataFusion sqllogictest driver
 
 mod engines;
+mod test_file;
 
 pub use engines::CurrentlyExecutingSqlTracker;
 pub use engines::DFColumnType;
@@ -46,4 +47,5 @@ mod util;
 
 pub use filters::*;
 pub use test_context::TestContext;
+pub use test_file::TestFile;
 pub use util::*;
diff --git a/datafusion/sqllogictest/src/test_file.rs 
b/datafusion/sqllogictest/src/test_file.rs
new file mode 100644
index 0000000000..c44cae1336
--- /dev/null
+++ b/datafusion/sqllogictest/src/test_file.rs
@@ -0,0 +1,186 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::collections::HashMap;
+use std::ffi::OsStr;
+use std::path::{Path, PathBuf};
+use std::sync::LazyLock;
+
+/// Represents a parsed test file
+///
+/// Note there is a custom Ord implementation that sorts test files by:
+/// 1. Hard coded test priority (lower runs first),
+/// 2. Relative path as deterministic tie-breaker.
+#[derive(Debug, PartialEq, Eq)]
+pub struct TestFile {
+    /// The absolute path to the file
+    pub path: PathBuf,
+    /// The relative path of the file (used for display)
+    pub relative_path: PathBuf,
+}
+
+impl TestFile {
+    /// Create a new [`TestFile`] from the given path, stripping any of the
+    /// known test directory prefixes for the relative path.
+    pub fn new(path: PathBuf, prefixes: &[&str]) -> Self {
+        let p = path.to_string_lossy();
+        for prefix in prefixes {
+            if p.starts_with(prefix) {
+                let relative_path = 
PathBuf::from(p.strip_prefix(prefix).unwrap());
+                return Self {
+                    path,
+                    relative_path,
+                };
+            }
+        }
+        let relative_path = PathBuf::from("");
+
+        Self {
+            path,
+            relative_path,
+        }
+    }
+
+    /// Returns true if the file has a .slt extension, indicating it is a 
sqllogictest file.
+    pub fn is_slt_file(&self) -> bool {
+        self.path.extension() == Some(OsStr::new("slt"))
+    }
+
+    /// Returns true if the relative path starts with the given prefix, which
+    /// can be used to filter tests by subdirectory or filename patterns.
+    pub fn relative_path_starts_with(&self, prefix: impl AsRef<Path>) -> bool {
+        self.relative_path.starts_with(prefix)
+    }
+}
+
+impl PartialOrd for TestFile {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl Ord for TestFile {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        let self_path = &self.relative_path;
+        let other_path = &other.relative_path;
+
+        let priority_self = 
TEST_PRIORITY.get(self_path).unwrap_or(&DEFAULT_PRIORITY);
+        let priority_other = 
TEST_PRIORITY.get(other_path).unwrap_or(&DEFAULT_PRIORITY);
+
+        priority_self
+            .cmp(priority_other)
+            .then_with(|| self_path.cmp(other_path)) // Tie-breaker: 
lexicographic order of relative paths.
+            // Final tie-breaker keeps Ord consistent with Eq when relative 
paths collide.
+            .then_with(|| self.path.cmp(&other.path))
+    }
+}
+
+/// TEST PRIORITY
+///
+/// Heuristically prioritize some test to run earlier.
+///
+/// Prioritizes test to run earlier if they are known to be long running (as
+/// each test file itself is run sequentially, but multiple test files are run
+/// in parallel.
+///
+/// Tests not listed here will run after the listed tests in deterministic
+/// lexicographic order by relative path.
+///
+/// You can find the top longest running tests by running `--timing-summary`
+/// mode. For example
+///
+/// ```shell
+/// $ cargo test --profile=ci --test sqllogictests -- --timing-summary top
+/// ...
+/// Per-file elapsed summary (deterministic):
+/// 1.    3.568s  aggregate.slt
+/// 2.    3.464s  joins.slt
+/// 3.    3.336s  imdb.slt
+/// 4.    3.085s  push_down_filter_regression.slt
+/// 5.    2.926s  aggregate_skip_partial.slt
+/// 6.    2.453s  array.slt
+/// 7.    2.399s  window.slt
+/// 8.    2.198s  group_by.slt
+/// 9.    1.281s  clickbench.slt
+/// 10.    1.058s  datetime/timestamps.slt
+/// ```
+const TEST_PRIORITY_ENTRIES: &[&str] = &[
+    "aggregate.slt", //  longest-running files go first
+    "joins.slt",
+    "imdb.slt",
+    "push_down_filter_regression.slt",
+    "aggregate_skip_partial.slt",
+    "array.slt",
+    "window.slt",
+    "group_by.slt",
+    "clickbench.slt",
+    "datetime/timestamps.slt",
+];
+
+/// Default priority for tests not in the priority map. Tests with lower
+/// priority values run first.
+const DEFAULT_PRIORITY: usize = 100;
+
+static TEST_PRIORITY: LazyLock<HashMap<PathBuf, usize>> = LazyLock::new(|| {
+    TEST_PRIORITY_ENTRIES
+        .iter()
+        .enumerate()
+        .map(|(priority, path)| (PathBuf::from(path), priority))
+        .collect()
+});
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn prioritized_files_are_first() {
+        let mut input = vec!["z_unlisted.slt", "a_unlisted.slt"];
+        input.extend(TEST_PRIORITY_ENTRIES.iter());
+        input.push("q_unlisted.slt");
+
+        let mut sorted = to_test_files(input);
+        sorted.sort_unstable();
+
+        println!("Sorted input: {sorted:?}");
+
+        // the prioritized files should be first, in the order specified by 
TEST_PRIORITY_ENTRIES
+        for file in sorted.iter().take(TEST_PRIORITY_ENTRIES.len()) {
+            assert!(
+                TEST_PRIORITY.contains_key(&file.relative_path),
+                "Expected prioritized file {file:?} not found in input 
{sorted:?}"
+            );
+        }
+        // last three files should be the unlisted ones in deterministic order
+        let expected_files =
+            to_test_files(["a_unlisted.slt", "q_unlisted.slt", 
"z_unlisted.slt"]);
+        assert!(
+            sorted.ends_with(&expected_files),
+            "Expected unlisted files {expected_files:?} at the end in 
deterministic order of {sorted:?}"
+        );
+    }
+
+    fn to_test_files<'a>(files: impl IntoIterator<Item = &'a str>) -> 
Vec<TestFile> {
+        files
+            .into_iter()
+            .map(|f| TestFile {
+                path: PathBuf::from(f),
+                relative_path: PathBuf::from(f),
+            })
+            .collect()
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to