sergiimk commented on issue #22935:
URL: https://github.com/apache/datafusion/issues/22935#issuecomment-4697209001
I'm including two diffable plans in case they help.
**Panics** (explicit schema with missing column):
```
CopyTo: format=parquet output_url=/tmp/.tmpuJv6jL/data.parquet options:
(single_file_output true)
Sort: offset ASC NULLS FIRST
Projection: CAST(CAST(row_number() PARTITION BY [Int32(1)] ORDER BY
[city ASC NULLS FIRST, op ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND
CURRENT ROW AS Decimal128(20, 0)) + Decimal128(Some(2),20,0) AS Int64) AS
offset, op, system_time, event_time, city, population, census_url
WindowAggr: windowExpr=[[row_number() PARTITION BY [Int32(1)] ORDER BY
[city ASC NULLS FIRST, op ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND
CURRENT ROW]]
Projection: op, assert_not_null(city) AS city,
assert_not_null(population) AS population, census_url, CASE WHEN event_time IS
NOT NULL THEN event_time ELSE TimestampMillisecond(946728000000, Some("UTC"))
END AS event_time, TimestampMillisecond(1262347200000, Some("UTC")) AS
system_time
Union
Projection: CASE WHEN old.city IS NULL THEN Int32(0) WHEN
__common_expr_1 THEN Int32(1) ELSE Int32(3) END AS op, CASE WHEN
__common_expr_1 THEN old.city ELSE new.city END AS city, CASE WHEN
__common_expr_1 THEN old.population ELSE new.population END AS population, CASE
WHEN __common_expr_1 THEN old.census_url ELSE new.census_url END AS census_url,
CASE WHEN __common_expr_1 THEN old.event_time ELSE new.event_time END AS
event_time
Projection: new.city IS NULL AS __common_expr_1,
old.event_time, old.city, old.population, old.census_url, new.city,
new.population, new.census_url, new.event_time
Filter: old.population IS DISTINCT FROM new.population OR
old.census_url IS DISTINCT FROM new.census_url OR new.event_time IS NOT NULL
AND old.event_time IS DISTINCT FROM new.event_time
Full Join: old.city = new.city
SubqueryAlias: old
Projection: ?table?.event_time, ?table?.city,
?table?.population, ?table?.census_url
Filter: __rank = UInt64(1) AND ?table?.op != Int32(1)
Projection: ?table?.op, ?table?.event_time,
?table?.city, ?table?.population, ?table?.census_url, __rank
WindowAggr: windowExpr=[[row_number() PARTITION
BY [?table?.city] ORDER BY [?table?.offset DESC NULLS LAST] ROWS BETWEEN
UNBOUNDED PRECEDING AND CURRENT ROW AS __rank]]
TableScan: ?table? projection=[offset, op,
event_time, city, population, census_url]
SubqueryAlias: new
Projection: ?table?.city, ?table?.population,
?table?.census_url, TimestampMillisecond(NULL, Some("UTC")) AS event_time
TableScan: ?table? projection=[city, population,
census_url]
Projection: Int32(2) AS op, old.city AS city, old.population AS
population, old.census_url AS census_url, old.event_time AS event_time
Inner Join: old.city = new.city Filter: old.population IS
DISTINCT FROM new.population OR old.census_url IS DISTINCT FROM new.census_url
OR new.event_time IS NOT NULL AND old.event_time IS DISTINCT FROM new.event_time
SubqueryAlias: old
Projection: ?table?.event_time, ?table?.city,
?table?.population, ?table?.census_url
Filter: __rank = UInt64(1) AND ?table?.op != Int32(1)
Projection: ?table?.op, ?table?.event_time,
?table?.city, ?table?.population, ?table?.census_url, __rank
WindowAggr: windowExpr=[[row_number() PARTITION BY
[?table?.city] ORDER BY [?table?.offset DESC NULLS LAST] ROWS BETWEEN UNBOUNDED
PRECEDING AND CURRENT ROW AS __rank]]
TableScan: ?table? projection=[offset, op,
event_time, city, population, census_url], partial_filters=[Boolean(true),
Boolean(true)]
SubqueryAlias: new
Projection: ?table?.city, ?table?.population,
?table?.census_url, TimestampMillisecond(NULL, Some("UTC")) AS event_time
Filter: Boolean(true)
TableScan: ?table? projection=[city, population,
census_url], partial_filters=[Boolean(true)]
```
**Doesn't panic** (inferred schema + missing column added as NULL literal):
```
CopyTo: format=parquet output_url=/tmp/.tmpuJv6jL/data.parquet options:
(single_file_output true)
Sort: offset ASC NULLS FIRST
Projection: CAST(CAST(row_number() PARTITION BY [Int32(1)] ORDER BY
[city ASC NULLS FIRST, op ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND
CURRENT ROW AS Decimal128(20, 0)) + Decimal128(Some(2),20,0) AS Int64) AS
offset, op, system_time, event_time, city, population, census_url
WindowAggr: windowExpr=[[row_number() PARTITION BY [Int32(1)] ORDER BY
[city ASC NULLS FIRST, op ASC NULLS FIRST] ROWS BETWEEN UNBOUNDED PRECEDING AND
CURRENT ROW]]
Projection: op, assert_not_null(city) AS city,
assert_not_null(population) AS population, census_url, CASE WHEN event_time IS
NOT NULL THEN event_time ELSE TimestampMillisecond(946728000000, Some("UTC"))
END AS event_time, TimestampMillisecond(1262347200000, Some("UTC")) AS
system_time
Union
Projection: CASE WHEN old.city IS NULL THEN Int32(0) WHEN
__common_expr_1 THEN Int32(1) ELSE Int32(3) END AS op, CASE WHEN
__common_expr_1 THEN old.city ELSE CAST(new.city AS Utf8View) END AS city, CASE
WHEN __common_expr_1 THEN old.population ELSE new.population END AS population,
CASE WHEN __common_expr_1 THEN old.census_url ELSE new.census_url END AS
census_url, CASE WHEN __common_expr_1 THEN old.event_time ELSE new.event_time
END AS event_time
Projection: new.city IS NULL AS __common_expr_1,
old.event_time, old.city, old.population, old.census_url, new.city,
new.population, new.census_url, new.event_time
Filter: old.population IS DISTINCT FROM new.population OR
old.census_url IS DISTINCT FROM new.census_url OR new.event_time IS NOT NULL
AND old.event_time IS DISTINCT FROM new.event_time
Full Join: old.city = CAST(new.city AS Utf8View)
SubqueryAlias: old
Projection: ?table?.event_time, ?table?.city,
?table?.population, census_url
Filter: __rank = UInt64(1) AND ?table?.op != Int32(1)
Projection: ?table?.op, ?table?.event_time,
?table?.city, ?table?.population, census_url, __rank
WindowAggr: windowExpr=[[row_number() PARTITION
BY [?table?.city] ORDER BY [?table?.offset DESC NULLS LAST] ROWS BETWEEN
UNBOUNDED PRECEDING AND CURRENT ROW AS __rank]]
Projection: ?table?.offset, ?table?.op,
?table?.event_time, ?table?.city, ?table?.population, Utf8(NULL) AS census_url
TableScan: ?table? projection=[offset, op,
event_time, city, population]
SubqueryAlias: new
Projection: ?table?.city, ?table?.population,
?table?.census_url, TimestampMillisecond(NULL, Some("UTC")) AS event_time
TableScan: ?table? projection=[city, population,
census_url]
Projection: Int32(2) AS op, old.city AS city, old.population AS
population, old.census_url AS census_url, old.event_time AS event_time
Inner Join: old.city = CAST(new.city AS Utf8View) Filter:
old.population IS DISTINCT FROM new.population OR old.census_url IS DISTINCT
FROM new.census_url OR new.event_time IS NOT NULL AND old.event_time IS
DISTINCT FROM new.event_time
SubqueryAlias: old
Projection: ?table?.event_time, ?table?.city,
?table?.population, census_url
Filter: __rank = UInt64(1) AND ?table?.op != Int32(1)
Projection: ?table?.op, ?table?.event_time,
?table?.city, ?table?.population, census_url, __rank
WindowAggr: windowExpr=[[row_number() PARTITION BY
[?table?.city] ORDER BY [?table?.offset DESC NULLS LAST] ROWS BETWEEN UNBOUNDED
PRECEDING AND CURRENT ROW AS __rank]]
Projection: ?table?.offset, ?table?.op,
?table?.event_time, ?table?.city, ?table?.population, Utf8(NULL) AS census_url
TableScan: ?table? projection=[offset, op,
event_time, city, population], partial_filters=[Boolean(true)]
SubqueryAlias: new
Projection: ?table?.city, ?table?.population,
?table?.census_url, TimestampMillisecond(NULL, Some("UTC")) AS event_time
TableScan: ?table? projection=[city, population,
census_url], partial_filters=[Boolean(true)]
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]