Kimahriman commented on code in PR #865:
URL: https://github.com/apache/datafusion-comet/pull/865#discussion_r1728068522
##########
spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala:
##########
@@ -2003,10 +2003,17 @@ class CometExpressionSuite extends CometTestBase with
AdaptiveSparkPlanHelper {
Seq(true, false).foreach { dictionaryEnabled =>
withTempDir { dir =>
val path = new Path(dir.toURI.toString, "test.parquet")
- makeParquetFileAllTypes(path, dictionaryEnabled = dictionaryEnabled,
10000)
+ makeParquetFileAllTypes(path, dictionaryEnabled = dictionaryEnabled,
1000)
val df = spark.read.parquet(path.toString)
checkSparkAnswerAndOperator(df.select(array(col("_2"), col("_3"),
col("_4"))))
checkSparkAnswerAndOperator(df.select(array(col("_4"), col("_11"),
lit(null))))
+ checkSparkAnswerAndOperator(
+ df.select(array(array(col("_4")), array(col("_4"), lit(null)))))
+ checkSparkAnswerAndOperator(df.select(array(col("_8"), col("_13"))))
+ // TODO: Some part of this converts the null to an empty string
+ // checkSparkAnswerAndOperator(df.select(array(col("_8"), col("_13"),
lit(null))))
Review Comment:
This one is really odd and I haven't been able to figure it out. Somewhere
along the way the `lit(null)` ends up becoming an empty string instead of null
whenever it has to be cast to a dictionary array. I logged the full plan for
this one — note that the `CastExpr` to `Dictionary(Int32, Utf8)` carries
`CastOptions { format_options: FormatOptions { null: "", ... } }`, which would
format null values as empty strings during the cast:
```
Projection: ProjectionExec { expr: [(ScalarFunctionExpr { fun: "<FUNC>",
name: "make_array", args: [CastExpr { expr: Column { name: "col_0", index: 0 },
cast_type: Dictionary(Int32, Utf8), cast_options: CastOptions { safe: false,
format_options: FormatOptions { safe: true, null: "", date_format: None,
datetime_format: None, timestamp_format: None, timestamp_tz_format: None,
time_format: None, duration_format: ISO8601 } } }, Column { name: "col_1",
index: 1 }, CastExpr { expr: Literal { value: Utf8(NULL) }, cast_type:
Dictionary(Int32, Utf8), cast_options: CastOptions { safe: false,
format_options: FormatOptions { safe: true, null: "", date_format: None,
datetime_format: None, timestamp_format: None, timestamp_tz_format: None,
time_format: None, duration_format: ISO8601 } } }], return_type: List(Field {
name: "item", data_type: Dictionary(Int32, Utf8), nullable: true, dict_id: 0,
dict_is_ordered: false, metadata: {} }) }, "col_0")], schema: Schema { fields:
[Field { name: "col_0",
data_type: List(Field { name: "item", data_type: Dictionary(Int32, Utf8),
nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable:
true, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} },
input: ScanExec { exec_context_id: 0, input_source: Some(GlobalRef { inner:
GlobalRefGuard { obj: JObject { internal: 0x12010b128, lifetime:
PhantomData<&()> }, vm: JavaVM(0x10b7aeb10) } }), input_source_description:
"CometScan parquet (unknown)", data_types: [Utf8, Utf8], batch: Mutex { data:
Some(Batch([StringArray
[
null,
null,
"222222222222222222222222222222222222222222222222",
null,
null,
"111111111111111111111111111111111111111111111111",
null,
"333333333333333333333333333333333333333333333333",
"000000000000000000000000000000000000000000000000",
null,
...80 elements...,
"222222222222222222222222222222222222222222222222",
null,
null,
"111111111111111111111111111111111111111111111111",
"222222222222222222222222222222222222222222222222",
"333333333333333333333333333333333333333333333333",
null,
"111111111111111111111111111111111111111111111111",
"222222222222222222222222222222222222222222222222",
"333333333333333333333333333333333333333333333333",
], DictionaryArray {keys: PrimitiveArray<Int32>
[
null,
null,
0,
null,
null,
1,
null,
2,
3,
null,
...80 elements...,
0,
null,
null,
1,
0,
2,
null,
1,
0,
2,
] values: StringArray
[
"2",
"1",
"3",
"0",
]}
], 100)), poisoned: false, .. }, cache: PlanProperties { eq_properties:
EquivalenceProperties { eq_group: EquivalenceGroup { classes: [] }, oeq_class:
OrderingEquivalenceClass { orderings: [] }, constants: [], schema: Schema {
fields: [Field { name: "col_0", data_type: Utf8, nullable: true, dict_id: 0,
dict_is_ordered: false, metadata: {} }, Field { name: "col_1", data_type:
Dictionary(Int32, Utf8), nullable: true, dict_id: 1, dict_is_ordered: false,
metadata: {} }], metadata: {} } }, partitioning: UnknownPartitioning(1),
execution_mode: Bounded, output_ordering: None }, metrics:
ExecutionPlanMetricsSet { inner: Mutex { data: MetricsSet { metrics: [] } } }
}, metrics: ExecutionPlanMetricsSet { inner: Mutex { data: MetricsSet {
metrics: [] } } }, cache: PlanProperties { eq_properties: EquivalenceProperties
{ eq_group: EquivalenceGroup { classes: [] }, oeq_class:
OrderingEquivalenceClass { orderings: [] }, constants: [], schema: Schema {
fields: [Field { name: "col_0", data_type: Li
st(Field { name: "item", data_type: Dictionary(Int32, Utf8), nullable: true,
dict_id: 0, dict_is_ordered: false, metadata: {} }), nullable: true, dict_id:
0, dict_is_ordered: false, metadata: {} }], metadata: {} } }, partitioning:
UnknownPartitioning(1), execution_mode: Bounded, output_ordering: None } }
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]