alamb commented on code in PR #11711:
URL: https://github.com/apache/datafusion/pull/11711#discussion_r1695992999


##########
datafusion/expr/src/type_coercion/binary.rs:
##########
@@ -1034,29 +1033,50 @@ fn is_time_with_valid_unit(datatype: DataType) -> bool {
     )
 }
 
+/// Timezone coercion is handled by the following rules:
+/// - If only one has a timezone, coerce the other to match
+/// - If both have a timezone, coerce to the left type
+/// - "UTC" and "+00:00" are considered equivalent
+fn temporal_coercion_nonstrict_timezone(
+    lhs_type: &DataType,
+    rhs_type: &DataType,
+) -> Option<DataType> {
+    use arrow::datatypes::DataType::*;
+
+    match (lhs_type, rhs_type) {
+        (Timestamp(lhs_unit, lhs_tz), Timestamp(rhs_unit, rhs_tz)) => {
+            let tz = match (lhs_tz, rhs_tz) {
+                (Some(lhs_tz), Some(rhs_tz)) => {
+                    match (lhs_tz.as_ref(), rhs_tz.as_ref()) {
+                        // UTC and "+00:00" are the same by definition. Most 
other timezones
+                        // do not have a 1-1 mapping between timezone and an 
offset from UTC
+                        ("UTC", "+00:00") | ("+00:00", "UTC") => 
Some(Arc::clone(lhs_tz)),
+                        (_lhs, _rhs) => Some(Arc::clone(lhs_tz)),
+                    }
+                }
+                (Some(lhs_tz), None) => Some(Arc::clone(lhs_tz)),
+                (None, Some(rhs_tz)) => Some(Arc::clone(rhs_tz)),
+                (None, None) => None,
+            };
+
+            let unit = timeunit_coercion(lhs_unit, rhs_unit);
+
+            Some(Timestamp(unit, tz))
+        }
+        _ => temporal_coercion(lhs_type, rhs_type),
+    }
+}
+
 /// Coercion rules for Temporal columns: the type that both lhs and rhs can be
 /// casted to for the purpose of a date computation
 /// For interval arithmetic, it doesn't handle datetime type +/- interval
-fn temporal_coercion(lhs_type: &DataType, rhs_type: &DataType) -> 
Option<DataType> {
+fn temporal_coercion_strict_timezone(

Review Comment:
   Could you add a note to the comments saying how this is different than 
non-strict (specifically it requires that the timezones be exactly equal other 
than the special case for UTC)?



##########
datafusion/expr/src/type_coercion/binary.rs:
##########
@@ -1034,29 +1033,50 @@ fn is_time_with_valid_unit(datatype: DataType) -> bool {
     )
 }
 
+/// Timezone coercion is handled by the following rules:
+/// - If only one has a timezone, coerce the other to match
+/// - If both have a timezone, coerce to the left type
+/// - "UTC" and "+00:00" are considered equivalent
+fn temporal_coercion_nonstrict_timezone(
+    lhs_type: &DataType,
+    rhs_type: &DataType,
+) -> Option<DataType> {
+    use arrow::datatypes::DataType::*;
+
+    match (lhs_type, rhs_type) {
+        (Timestamp(lhs_unit, lhs_tz), Timestamp(rhs_unit, rhs_tz)) => {
+            let tz = match (lhs_tz, rhs_tz) {
+                (Some(lhs_tz), Some(rhs_tz)) => {
+                    match (lhs_tz.as_ref(), rhs_tz.as_ref()) {
+                        // UTC and "+00:00" are the same by definition. Most 
other timezones
+                        // do not have a 1-1 mapping between timezone and an 
offset from UTC
+                        ("UTC", "+00:00") | ("+00:00", "UTC") => 
Some(Arc::clone(lhs_tz)),

Review Comment:
   since this case also returns the same value as the case below (`lhs_tz`), 
what value do we get special casing / checking for UTC ?
   
   As in why not just 
   ```rust
   // If both timestamps have a timezone, use the left hand timezone
   (Some(lhs_tz), Some(rhs_tz)) => Some(Arc::clone(lhs_tz)),
   ```



##########
datafusion/expr/src/type_coercion/binary.rs:
##########
@@ -1725,6 +1774,27 @@ mod tests {
             DataType::LargeBinary
         );
 
+        // Timestamps
+        let utc: Option<Arc<str>> = Some("UTC".into());
+        test_coercion_binary_rule!(
+            DataType::Timestamp(TimeUnit::Second, utc.clone()),
+            DataType::Timestamp(TimeUnit::Second, utc.clone()),
+            Operator::Eq,
+            DataType::Timestamp(TimeUnit::Second, utc.clone())
+        );
+        test_coercion_binary_rule!(
+            DataType::Timestamp(TimeUnit::Second, utc.clone()),
+            DataType::Timestamp(TimeUnit::Second, 
Some("Europe/Brussels".into())),
+            Operator::Eq,
+            DataType::Timestamp(TimeUnit::Second, utc.clone())
+        );

Review Comment:
   Maybe we could also add a test showing that the lhs is still picked even 
with UTC
   
   Like this I think would pass.
   
   
   ```rust
           test_coercion_binary_rule!(
               DataType::Timestamp(TimeUnit::Second, 
Some("Europe/Brussels".into())),
               DataType::Timestamp(TimeUnit::Second, utc.clone()),
               Operator::Eq,
               DataType::Timestamp(TimeUnit::Second, 
Some("Europe/Brussels".into())),
           );
   ```
   
   Though I am a little confused if this is the intention (or if this case was 
supposed to coerce to UTC)



##########
datafusion/expr/src/type_coercion/binary.rs:
##########
@@ -1034,29 +1033,50 @@ fn is_time_with_valid_unit(datatype: DataType) -> bool {
     )
 }
 
+/// Timezone coercion is handled by the following rules:

Review Comment:
   Could we add some rationale to this comment about why one might choose 
strict vs non-strict coercion? 
   
   It looks like nonstrict coercion is used for `values` and `comparison` 
coercion but not function arguments (which makes sense to me, as the function 
signature has some way to specify what timezones it will handle):
   
   
https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.TypeSignature.html#data-types
   
   



##########
datafusion/expr/src/type_coercion/binary.rs:
##########
@@ -1076,31 +1096,60 @@ fn temporal_coercion(lhs_type: &DataType, rhs_type: 
&DataType) -> Option<DataTyp
                 (None, None) => None,
             };
 
-            let unit = match (lhs_unit, rhs_unit) {
-                (Second, Millisecond) => Second,
-                (Second, Microsecond) => Second,
-                (Second, Nanosecond) => Second,
-                (Millisecond, Second) => Second,
-                (Millisecond, Microsecond) => Millisecond,
-                (Millisecond, Nanosecond) => Millisecond,
-                (Microsecond, Second) => Second,
-                (Microsecond, Millisecond) => Millisecond,
-                (Microsecond, Nanosecond) => Microsecond,
-                (Nanosecond, Second) => Second,
-                (Nanosecond, Millisecond) => Millisecond,
-                (Nanosecond, Microsecond) => Microsecond,
-                (l, r) => {
-                    assert_eq!(l, r);
-                    *l
-                }
-            };
+            let unit = timeunit_coercion(lhs_unit, rhs_unit);

Review Comment:
   this is a nice refactor



##########
datafusion/sqllogictest/test_files/timestamps.slt:
##########
@@ -3021,3 +3021,83 @@ drop view t_utc;
 
 statement ok
 drop view t_timezone;
+
+# test comparisons across timestamps
+statement ok
+create table t AS
+VALUES
+  ('2024-01-01T00:00:01Z'),
+  ('2024-02-01T00:00:01Z'),
+  ('2024-03-01T00:00:01Z')
+;
+
+statement ok
+create view t_utc as
+select column1::timestamp AT TIME ZONE 'UTC' as "column1"
+from t;
+
+statement ok
+create view t_europe as
+select column1::timestamp AT TIME ZONE 'Europe/Brussels' as "column1"
+from t;
+
+query P
+SELECT column1 FROM t_utc WHERE column1 < '2024-02-01T00:00:00' AT TIME ZONE 
'America/Los_Angeles';
+----
+2024-01-01T00:00:01Z
+2024-02-01T00:00:01Z
+
+query P
+SELECT column1 FROM t_europe WHERE column1 = '2024-01-31T16:00:01' AT TIME 
ZONE 'America/Los_Angeles';
+----
+2024-02-01T00:00:01+01:00
+
+query P
+SELECT column1 FROM t_europe WHERE column1 BETWEEN '2020-01-01T00:00:00' AT 
TIME ZONE 'Australia/Brisbane' AND '2024-02-01T00:00:00' AT TIME ZONE 
'America/Los_Angeles';
+----
+2024-01-01T00:00:01+01:00
+2024-02-01T00:00:01+01:00
+
+query P
+SELECT column1 FROM t_utc WHERE column1 IN ('2024-01-31T16:00:01' AT TIME ZONE 
'America/Los_Angeles');
+----
+2024-02-01T00:00:01Z
+
+query P
+SELECT column1 as u from t_utc UNION SELECT column1 from t_europe ORDER BY u;
+----
+2023-12-31T23:00:01Z
+2024-01-01T00:00:01Z
+2024-01-31T23:00:01Z
+2024-02-01T00:00:01Z
+2024-02-29T23:00:01Z
+2024-03-01T00:00:01Z
+
+query P
+SELECT column1 as e from t_europe UNION SELECT column1 from t_utc ORDER BY e;
+----
+2024-01-01T00:00:01+01:00
+2024-01-01T01:00:01+01:00
+2024-02-01T00:00:01+01:00
+2024-02-01T01:00:01+01:00
+2024-03-01T00:00:01+01:00
+2024-03-01T01:00:01+01:00
+
+query P
+SELECT nvl2(null, '2020-01-01T00:00:00-04:00'::timestamp, 
'2021-02-03T04:05:06Z'::timestamp)
+----
+2021-02-03T04:05:06
+
+query ?
+SELECT make_array('2020-01-01T00:00:00-04:00'::timestamp, 
'2021-01-01T01:02:03Z'::timestamp);
+----
+[2020-01-01T04:00:00, 2021-01-01T01:02:03]
+

Review Comment:
   I think another case to add to test the `coerce_values` would be something 
like this
   
   ```sql
   SELECT * FROM VALUES 
    ( '2023-12-31T23:00:01Z'),
    ('2024-02-01T00:00:00' AT TIME ZONE 'America/Los_Angeles');
   ```
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to