alamb commented on code in PR #13672:
URL: https://github.com/apache/datafusion/pull/13672#discussion_r1894533684


##########
datafusion-cli/tests/cli_integration.rs:
##########
@@ -17,42 +17,223 @@
 
 use std::process::Command;
 
-use assert_cmd::prelude::{CommandCargoExt, OutputAssertExt};
-use predicates::prelude::predicate;
 use rstest::rstest;
 
+use aws_config::Region;
+use aws_credential_types::Credentials;
+use aws_sdk_s3::error::SdkError;
+use aws_sdk_ssooidc::config::BehaviorVersion;
+use insta::{glob, Settings};
+use insta_cmd::{assert_cmd_snapshot, get_cargo_bin};
+use std::{env, fs};
+
+fn cli() -> Command {
+    Command::new(get_cargo_bin("datafusion-cli"))
+}
+
+fn make_settings() -> Settings {
+    let mut settings = Settings::clone_current();
+    settings.set_prepend_module_to_snapshot(false);
+    settings.add_filter(r"Elapsed .* seconds\.", "[ELAPSED]");
+    settings.add_filter(r"DataFusion CLI v.*", "[CLI_VERSION]");
+    settings
+}
+
 #[cfg(test)]
 #[ctor::ctor]
 fn init() {
     // Enable RUST_LOG logging configuration for tests
     let _ = env_logger::try_init();
 }
 
-// Disabled due to https://github.com/apache/datafusion/issues/10793
-#[cfg(not(target_family = "windows"))]
 #[rstest]
-#[case::exec_from_commands(
-    ["--command", "select 1", "--format", "json", "-q"],
-    "[{\"Int64(1)\":1}]\n"
-)]
 #[case::exec_multiple_statements(
-    ["--command", "select 1; select 2;", "--format", "json", "-q"],
-    "[{\"Int64(1)\":1}]\n[{\"Int64(2)\":2}]\n"
+    "statements",
+    ["--command", "select 1; select 2;", "-q"],
 )]
 #[case::exec_from_files(
-    ["--file", "tests/data/sql.txt", "--format", "json", "-q"],
-    "[{\"Int64(1)\":1}]\n"
+    "files",
+    ["--file", "tests/sql/select.sql", "-q"],
 )]
 #[case::set_batch_size(
-    ["--command", "show datafusion.execution.batch_size", "--format", "json", 
"-q", "-b", "1"],
-    "[{\"name\":\"datafusion.execution.batch_size\",\"value\":\"1\"}]\n"
+    "batch_size",
+    ["--command", "show datafusion.execution.batch_size", "-q", "-b", "1"],
 )]
 #[test]
 fn cli_quick_test<'a>(
+    #[case] snapshot_name: &'a str,
     #[case] args: impl IntoIterator<Item = &'a str>,
-    #[case] expected: &str,
 ) {
-    let mut cmd = Command::cargo_bin("datafusion-cli").unwrap();
+    let mut settings = make_settings();
+    settings.set_snapshot_suffix(snapshot_name);
+    let _bound = settings.bind_to_scope();
+
+    let mut cmd = cli();
     cmd.args(args);
-    cmd.assert().stdout(predicate::eq(expected));
+
+    assert_cmd_snapshot!(cmd);
+}
+
+#[rstest]
+#[case("csv")]
+#[case("tsv")]
+#[case("table")]
+#[case("json")]
+#[case("nd-json")]
+#[case("automatic")]
+#[test]
+fn test_cli_format<'a>(#[case] format: &'a str) {

Review Comment:
   I was wondering why we need all the new aws-s3-sdk dependencies and it looks 
like it is needed in order to programmatically setup the bucket. 
   
   I think minio also supports just serving the contents of directories as s3 
files
   
   Did you consider looking at just configuring minio directly rather than 
using the AWS S3 ASK just to create. a bucket?
   
   I know it doesn't seem like a big deal, but this PR adds many dependencies 
(that is why Cargo.lock is so big) and we try to keep the dependency chain down 
as much as possible (it is already large)
   
   I realize it is only for datafusion-cli dev, but that dev directly impacts 
maintenance (



##########
datafusion-cli/CONTRIBUTING.md:
##########
@@ -0,0 +1,72 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# Development instructions
+
+## Running Tests
+
+Tests can be run using `cargo`
+
+```shell
+cargo test
+```
+
+## Snapshot testing
+
+To test CLI output, [Insta](https://github.com/mitsuhiko/insta) is used for 
snapshot testing. Snapshots are generated
+and compared on each test run. If the output changes, tests will fail.
+To review the changes, you can use Insta CLI:
+
+```shell
+cargo install cargo-insta
+cargo insta review

Review Comment:
   we use this in influxdb_iox and I find it super useful



##########
datafusion-cli/tests/cli_integration.rs:
##########
@@ -17,42 +17,223 @@
 
 use std::process::Command;
 
-use assert_cmd::prelude::{CommandCargoExt, OutputAssertExt};
-use predicates::prelude::predicate;
 use rstest::rstest;
 
+use aws_config::Region;
+use aws_credential_types::Credentials;
+use aws_sdk_s3::error::SdkError;
+use aws_sdk_ssooidc::config::BehaviorVersion;
+use insta::{glob, Settings};
+use insta_cmd::{assert_cmd_snapshot, get_cargo_bin};
+use std::{env, fs};
+
+fn cli() -> Command {
+    Command::new(get_cargo_bin("datafusion-cli"))
+}
+
+fn make_settings() -> Settings {
+    let mut settings = Settings::clone_current();
+    settings.set_prepend_module_to_snapshot(false);
+    settings.add_filter(r"Elapsed .* seconds\.", "[ELAPSED]");
+    settings.add_filter(r"DataFusion CLI v.*", "[CLI_VERSION]");
+    settings
+}
+
 #[cfg(test)]
 #[ctor::ctor]
 fn init() {
     // Enable RUST_LOG logging configuration for tests
     let _ = env_logger::try_init();
 }
 
-// Disabled due to https://github.com/apache/datafusion/issues/10793
-#[cfg(not(target_family = "windows"))]
 #[rstest]
-#[case::exec_from_commands(
-    ["--command", "select 1", "--format", "json", "-q"],
-    "[{\"Int64(1)\":1}]\n"
-)]
 #[case::exec_multiple_statements(
-    ["--command", "select 1; select 2;", "--format", "json", "-q"],
-    "[{\"Int64(1)\":1}]\n[{\"Int64(2)\":2}]\n"
+    "statements",
+    ["--command", "select 1; select 2;", "-q"],
 )]
 #[case::exec_from_files(
-    ["--file", "tests/data/sql.txt", "--format", "json", "-q"],
-    "[{\"Int64(1)\":1}]\n"
+    "files",
+    ["--file", "tests/sql/select.sql", "-q"],
 )]
 #[case::set_batch_size(
-    ["--command", "show datafusion.execution.batch_size", "--format", "json", 
"-q", "-b", "1"],
-    "[{\"name\":\"datafusion.execution.batch_size\",\"value\":\"1\"}]\n"
+    "batch_size",
+    ["--command", "show datafusion.execution.batch_size", "-q", "-b", "1"],
 )]
 #[test]
 fn cli_quick_test<'a>(
+    #[case] snapshot_name: &'a str,
     #[case] args: impl IntoIterator<Item = &'a str>,
-    #[case] expected: &str,
 ) {
-    let mut cmd = Command::cargo_bin("datafusion-cli").unwrap();
+    let mut settings = make_settings();
+    settings.set_snapshot_suffix(snapshot_name);
+    let _bound = settings.bind_to_scope();
+
+    let mut cmd = cli();
     cmd.args(args);
-    cmd.assert().stdout(predicate::eq(expected));
+
+    assert_cmd_snapshot!(cmd);
+}
+
+#[rstest]
+#[case("csv")]
+#[case("tsv")]
+#[case("table")]
+#[case("json")]
+#[case("nd-json")]
+#[case("automatic")]
+#[test]
+fn test_cli_format<'a>(#[case] format: &'a str) {
+    let mut settings = make_settings();
+    settings.set_snapshot_suffix(format);
+    let _bound = settings.bind_to_scope();
+
+    let mut cmd = cli();
+    cmd.args(["--command", "select 1", "-q", "--format", format]);
+
+    assert_cmd_snapshot!(cmd);
+}
+
+async fn s3_client() -> aws_sdk_s3::Client {
+    let access_key_id =
+        env::var("AWS_ACCESS_KEY_ID").expect("AWS_ACCESS_KEY_ID is not set");
+    let secret_access_key =
+        env::var("AWS_SECRET_ACCESS_KEY").expect("AWS_SECRET_ACCESS_KEY is not 
set");
+
+    let endpoint_url = env::var("AWS_ENDPOINT").expect("AWS_ENDPOINT is not 
set");
+    let region = 
Region::new(env::var("AWS_REGION").unwrap_or("df-test".to_string()));
+
+    let allow_non_test_credentials = env::var("ALLOW_NON_TEST_CREDENTIALS")
+        .map(|v| v == "1")
+        .unwrap_or(false);
+
+    if allow_non_test_credentials
+        || !access_key_id.starts_with("TEST-")
+        || !secret_access_key.starts_with("TEST-")
+    {
+        panic!("Refusing with non-test credentials. Either set 
ALLOW_NON_TEST_CREDENTIALS=1 or add TEST- for AWS_ACCESS_KEY_ID and 
AWS_SECRET_ACCESS_KEY");
+    }
+
+    let creds = Credentials::new(access_key_id, secret_access_key, None, None, 
"test");
+    let config = aws_sdk_s3::Config::builder()
+        .credentials_provider(creds)
+        .endpoint_url(endpoint_url)
+        .region(region)
+        .behavior_version(BehaviorVersion::v2024_03_28())
+        .build();
+
+    aws_sdk_s3::Client::from_conf(config)
+}
+
+async fn create_bucket(bucket_name: &str, client: &aws_sdk_s3::Client) {
+    match client.head_bucket().bucket(bucket_name).send().await {
+        Ok(_) => {}
+        Err(SdkError::ServiceError(err))
+            if matches!(
+                err.err(),
+                
aws_sdk_s3::operation::head_bucket::HeadBucketError::NotFound(_)
+            ) =>
+        {
+            client
+                .create_bucket()
+                .bucket(bucket_name)
+                .send()
+                .await
+                .expect("Failed to create bucket");
+        }
+        Err(e) => panic!("Failed to head bucket: {:?}", e),
+    }
+}
+
+async fn move_file_to_bucket(
+    from_path: &str,
+    to_path: &str,
+    bucket_name: &str,
+    client: &aws_sdk_s3::Client,
+) {
+    let body =
+        
aws_sdk_s3::primitives::ByteStream::from_path(std::path::Path::new(from_path))
+            .await
+            .expect("Failed to read file");
+
+    client
+        .put_object()
+        .bucket(bucket_name)
+        .key(to_path)
+        .body(body)
+        .send()
+        .await
+        .expect("Failed to put object");
+}
+
+#[tokio::test]
+async fn test_cli() {
+    if env::var("TEST_STORAGE_INTEGRATION").is_err() {
+        eprintln!("Skipping external storages integration tests");
+        return;
+    }
+
+    let client = s3_client().await;
+    create_bucket("cli", &client).await;
+    move_file_to_bucket(
+        "../datafusion/core/tests/data/cars.csv",
+        "cars.csv",
+        "cli",
+        &client,
+    )
+    .await;
+
+    let settings = make_settings();
+    let _bound = settings.bind_to_scope();
+
+    glob!("sql/*.sql", |path| {
+        let input = fs::read_to_string(path).unwrap();
+        assert_cmd_snapshot!(cli().pass_stdin(input))

Review Comment:
   FWIW we have had really nice luck in influxdb_iox using insta, and instead 
of using the external snap shot testing we used inline snapshots (so the 
expected results are inline with the test code)
   
   It looks like this:
   
   ```rust
      fn test_union_not_nested() {
           let plan = Arc::new(UnionExec::new(vec![other_node()]));
           let opt = NestedUnion;
           insta::assert_yaml_snapshot!(
               OptimizationTest::new(plan, opt),
               @r#"
           input:
             - " UnionExec"
             - "   EmptyExec"
           output:
             Ok:
               - " UnionExec"
               - "   EmptyExec"
           "#
           );
       }
   ```
   
   Is that possible with assert_cmt_snapshot?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to