GitHub user TheIndifferent created a discussion: Query data with "WHERE" clause 
for nested objects

Hey everyone,

It seems like I am able to import the data structure with nested objects into 
DataFusion, but I cannot figure out how to query it, could someone please point 
me to the right example or how do I do that?

In the following example code, struct `A` has a field of type `B`, and I am 
trying to query all `A` where `B.e == {}`. I was trying to use `JOIN UNNEST` 
but I doubt that is the right approach on how to work with a data structure 
like that:
```rust
use datafusion::arrow::datatypes::*;
use datafusion::arrow::json::ReaderBuilder;
use datafusion::prelude::*;
use serde::{Deserialize, Serialize};
use std::io::Cursor;
use std::sync::Arc;

#[tokio::test]
async fn test_nested_unnest_query() {
    // Given:
    let ctx = SessionContext::new();

    let test_data = TestData {
        items: vec![
            A {
                c: 1,
                d: "first".to_string(),
                f: vec![
                    B { c: "x".to_string(), d: "y".to_string(), e: 
"target".to_string() },
                    B { c: "p".to_string(), d: "q".to_string(), e: 
"other".to_string() },
                ],
            },
            A {
                c: 2,
                d: "second".to_string(),
                f: vec![
                    B { c: "m".to_string(), d: "n".to_string(), e: 
"other".to_string() },
                ],
            },
        ],
    };

    let schema = create_schema();

    let json_data = serde_json::to_string(&test_data).unwrap();
    let cursor = Cursor::new(json_data.as_bytes());
    let mut reader = 
ReaderBuilder::new(Arc::new(schema)).build(cursor).unwrap();
    let batch = reader.next().unwrap().unwrap();
    ctx.register_batch("test_data", batch).unwrap();

    // When:
    let query = "
        SELECT a.c, a.d, b.e
        FROM test_data
        CROSS JOIN UNNEST(items) AS a
        CROSS JOIN UNNEST(a.f) AS b
        WHERE b.e = 'target'
    ";

    let result = ctx.sql(query).await.unwrap();

    // Then:
    let batches = result.collect().await.unwrap();
    let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
    assert_eq!(total_rows, 1, "Should find exactly 1 record with b.e = 
'target'");
}

fn create_schema() -> Schema {
    Schema::new(vec![Field::new(
        "items",
        DataType::List(Arc::new(Field::new(
            "item",
            DataType::Struct(Fields::from(vec![
                Field::new("c", DataType::UInt32, false),
                Field::new("d", DataType::Utf8, false),
                Field::new(
                    "f",
                    DataType::List(Arc::new(Field::new(
                        "item",
                        DataType::Struct(Fields::from(vec![
                            Field::new("c", DataType::Utf8, false),
                            Field::new("d", DataType::Utf8, false),
                            Field::new("e", DataType::Utf8, false),
                        ])),
                        true,
                    ))),
                    false,
                ),
            ])),
            true,
        ))),
        false,
    )])
}

#[derive(Serialize, Deserialize)]
struct TestData {
    items: Vec<A>,
}

#[derive(Serialize, Deserialize)]
struct A {
    c: u32,
    d: String,
    f: Vec<B>,
}

#[derive(Serialize, Deserialize)]
struct B {
    c: String,
    d: String,
    e: String,
}

```

GitHub link: https://github.com/apache/datafusion/discussions/17714

----
This is an automatically sent email for [email protected].
To unsubscribe, please send an email to: 
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to