This is an automated email from the ASF dual-hosted git repository.

npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new b0734e68d6 ARROW-14821: [R] Implement bindings for lubridate's 
floor_date, ceiling_date, and round_date (#12154)
b0734e68d6 is described below

commit b0734e68d6f57fb22869df0d0aa2ae4dd75765dc
Author: Danielle Navarro <[email protected]>
AuthorDate: Fri Jul 22 08:31:02 2022 +1000

    ARROW-14821: [R] Implement bindings for lubridate's floor_date, 
ceiling_date, and round_date (#12154)
    
    This patch provides dplyr bindings to for lubridate functions 
`floor_date()`, `ceiling_date()`, and `round_date()`. This is my first attempt 
at writing a patch, so my apologies if I've made any errors 🙂
    
    ### Supported functionality:
    
    - Allows rounding to integer multiples of common time units (second, 
minutes, days, etc)
    - Mirrors the lubridate syntax allowing fractional units such as `unit = 
.001 seconds` as an alias for `unit = 1 millisecond`
    - Allows partial matching of date units based on first three characters: 
e.g. `sec`, `second`, `seconds` all match `second`
    - Mirrors lubridate in throwing errors when unit exceeds thresholds: 60 
seconds, 60 minutes, 24 hours
    
    ~~### Major problems not yet addressed:~~
    
    ~~- Does not yet support the `week_start` argument, and implicitly fixes 
`week_start = 4`~~
    ~~- Does not yet mirror lubridate handling of timezones~~
    
    ~~I'd prefer to fix these two issues before merging, but I'm uncertain how 
best to handle them. Any advice would be appreciated!~~
    
    ~~### Minor things not yet addressed~~
    
    ~~- During rounding lubridate sometimes coerces Date objects to POSIXct. 
This is not mirrored in the arrow bindings: date32 classes remain date32 
classes. This introduces minor differences in rounding in some cases~~
    ~~- Does not yet support the `change_on_boundary` argument to 
`ceiling_date()`. It's a small discrepancy, but it means that the default 
behaviour of the arrow dplyr binding mirrors lubridate prior to v1.6.0~~
    
    EDIT: issues now addressed!
    
    
    Authored-by: Danielle Navarro <[email protected]>
    Signed-off-by: Neal Richardson <[email protected]>
---
 r/R/dplyr-datetime-helpers.R                 | 158 ++++++++
 r/R/dplyr-funcs-datetime.R                   |  52 +++
 r/tests/testthat/test-dplyr-funcs-datetime.R | 578 +++++++++++++++++++++++++++
 3 files changed, 788 insertions(+)

diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R
index 9199ce0dd5..efcc62ff4e 100644
--- a/r/R/dplyr-datetime-helpers.R
+++ b/r/R/dplyr-datetime-helpers.R
@@ -417,3 +417,161 @@ build_strptime_exprs <- function(x, formats) {
     )
   )
 }
+
+# This function parses the "unit" argument to round_date, floor_date, and
+# ceiling_date. The input x is a single string like "second", "3 seconds",
+# "10 microseconds" or "2 secs" used to specify the size of the unit to
+# which the temporal data should be rounded. The matching rules implemented
+# are designed to mirror lubridate exactly: it extracts the numeric multiple
+# from the start of the string (presumed to be 1 if no number is present)
+# and selects the unit by looking at the first 3 characters only. This choice
+# ensures that "secs", "second", "microsecs" etc are all valid, but it is
+# very permissive and would interpret "mickeys" as microseconds. This
+# permissive implementation mirrors the corresponding implementation in
+# lubridate. The return value is a list with integer-valued components
+# "multiple" and  "unit"
+parse_period_unit <- function(x) {
+  # the regexp matches against fractional units, but per lubridate
+  # supports integer multiples of a known unit only
+  match_info <- regexpr(
+    pattern = " *(?<multiple>[0-9.,]+)? *(?<unit>[^ \t\n]+)",
+    text = x[[1]],
+    perl = TRUE
+  )
+
+  capture_start <- attr(match_info, "capture.start")
+  capture_length <- attr(match_info, "capture.length")
+  capture_end <- capture_start + capture_length - 1L
+
+  str_unit <- substr(x, capture_start[[2]], capture_end[[2]])
+  str_multiple <- substr(x, capture_start[[1]], capture_end[[1]])
+
+  known_units <- c("nanosecond", "microsecond", "millisecond", "second",
+                   "minute", "hour", "day", "week", "month", "quarter", "year")
+
+  # match the period unit
+  str_unit_start <- substr(str_unit, 1, 3)
+  unit <- as.integer(pmatch(str_unit_start, known_units)) - 1L
+
+  if (any(is.na(unit))) {
+    abort(
+      sprintf(
+        "Invalid period name: '%s'",
+        str_unit,
+        ". Known units are",
+        oxford_paste(known_units, "and")
+      )
+    )
+  }
+
+  # empty string in multiple interpreted as 1
+  if (capture_length[[1]] == 0) {
+    multiple <- 1L
+
+  # otherwise parse the multiple
+  } else {
+    multiple <- as.numeric(str_multiple)
+
+    # special cases: interpret fractions of 1 second as integer
+    # multiples of nanoseconds, microseconds, or milliseconds
+    # to mirror lubridate syntax
+    if (unit == 3L) {
+      if (multiple < 10^-6) {
+        unit <- 0L
+        multiple <- 10^9 * multiple
+      }
+      if (multiple < 10^-3) {
+        unit <- 1L
+        multiple <- 10^6 * multiple
+      }
+      if (multiple < 1) {
+        unit <- 2L
+        multiple <- 10^3 * multiple
+      }
+    }
+
+    multiple <- as.integer(multiple)
+  }
+
+  # more special cases: lubridate imposes sensible maximum
+  # values on the number of seconds, minutes and hours
+  if (unit == 3L && multiple > 60) {
+    abort("Rounding with second > 60 is not supported")
+  }
+  if (unit == 4L && multiple > 60) {
+    abort("Rounding with minute > 60 is not supported")
+  }
+  if (unit == 5L && multiple > 24) {
+    abort("Rounding with hour > 24 is not supported")
+  }
+
+  list(unit = unit, multiple = multiple)
+}
+
+# This function handles round/ceil/floor when unit is week. The fn argument
+# specifies which of the temporal rounding functions (round_date, etc) is to
+# be applied, x is the data argument to the rounding function, week_start is
+# an integer indicating which day of the week is the start date. The C++
+# library natively handles Sunday and Monday so in those cases we pass the
+# week_starts_monday option through. Other week_start values are handled here
+shift_temporal_to_week <- function(fn, x, week_start, options) {
+  if (week_start == 7) { # Sunday
+    options$week_starts_monday <- FALSE
+    return(Expression$create(fn, x, options = options))
+  }
+
+  if (week_start == 1) { # Monday
+    options$week_starts_monday <- TRUE
+    return(Expression$create(fn, x, options = options))
+  }
+
+  # other cases use offset-from-Monday: to ensure type-stable output there
+  # are two separate helpers, one to handle date32 input and the other to
+  # handle timestamps
+  options$week_starts_monday <- TRUE
+  offset <- as.integer(week_start) - 1
+
+  is_date32 <- inherits(x, "Date") ||
+    (inherits(x, "Expression") && x$type_id() == Type$DATE32)
+
+  if (is_date32) {
+    shifted_date <- shift_date32_to_week(fn, x, offset, options = options)
+  } else {
+    shifted_date <- shift_timestamp_to_week(fn, x, offset, options = options)
+  }
+
+  shifted_date
+}
+
+# timestamp input should remain timestamp
+shift_timestamp_to_week <- function(fn, x, offset, options) {
+  offset_seconds <- build_expr(
+    "cast",
+    Scalar$create(offset * 86400L, int64()),
+    options = cast_options(to_type = duration(unit = "s"))
+  )
+  shift_offset <- build_expr(fn, x - offset_seconds, options = options)
+
+  shift_offset + offset_seconds
+}
+
+# to avoid date32 types being cast to timestamp during the temporal
+# arithmetic, the offset logic needs to use the count in days and
+# use integer arithmetic: this feels inelegant, but it ensures that
+# temporal rounding functions remain type stable
+shift_date32_to_week <- function(fn, x, offset, options) {
+  # offset the date
+  offset <- Expression$scalar(Scalar$create(offset, int32()))
+  x_int <- build_expr("cast", x, options = cast_options(to_type = int32()))
+  x_int_offset <- x_int - offset
+  x_offset <- build_expr("cast", x_int_offset, options = cast_options(to_type 
= date32()))
+
+  # apply round/floor/ceil
+  shift_offset <- build_expr(fn, x_offset, options = options)
+
+  # undo offset and return
+  shift_int_offset <- build_expr("cast", shift_offset, options = 
cast_options(to_type = int32()))
+  shift_int <- shift_int_offset + offset
+
+  build_expr("cast", shift_int, options = cast_options(to_type = date32()))
+}
diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R
index 1db6c647d5..fd705e4578 100644
--- a/r/R/dplyr-funcs-datetime.R
+++ b/r/R/dplyr-funcs-datetime.R
@@ -25,6 +25,7 @@ register_bindings_datetime <- function() {
   register_bindings_duration_constructor()
   register_bindings_duration_helpers()
   register_bindings_datetime_parsers()
+  register_bindings_datetime_rounding()
 }
 
 register_bindings_datetime_utility <- function() {
@@ -637,4 +638,55 @@ register_bindings_datetime_parsers <- function() {
 
     build_expr("assume_timezone", coalesce_output, options = list(timezone = 
tz))
   })
+
+}
+
+register_bindings_datetime_rounding <- function() {
+  register_binding(
+    "round_date",
+    function(x,
+             unit = "second",
+             week_start = getOption("lubridate.week.start", 7)) {
+
+    opts <- parse_period_unit(unit)
+    if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start
+      return(shift_temporal_to_week("round_temporal", x, week_start, options = 
opts))
+    }
+
+    Expression$create("round_temporal", x, options = opts)
+  })
+
+  register_binding(
+    "floor_date",
+    function(x,
+             unit = "second",
+             week_start = getOption("lubridate.week.start", 7)) {
+
+    opts <- parse_period_unit(unit)
+    if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start
+      return(shift_temporal_to_week("floor_temporal", x, week_start, options = 
opts))
+    }
+
+    Expression$create("floor_temporal", x, options = opts)
+  })
+
+  register_binding(
+    "ceiling_date",
+    function(x,
+             unit = "second",
+             change_on_boundary = NULL,
+             week_start = getOption("lubridate.week.start", 7)) {
+    opts <- parse_period_unit(unit)
+    if (is.null(change_on_boundary)) {
+      change_on_boundary <- ifelse(call_binding("is.Date", x), TRUE, FALSE)
+    }
+    opts$ceil_is_strictly_greater <- change_on_boundary
+
+    if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start
+      return(shift_temporal_to_week("ceil_temporal", x, week_start, options = 
opts))
+    }
+
+    Expression$create("ceil_temporal", x, options = opts)
+  })
+
 }
diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R 
b/r/tests/testthat/test-dplyr-funcs-datetime.R
index 6caf061fc8..1c6b73d7af 100644
--- a/r/tests/testthat/test-dplyr-funcs-datetime.R
+++ b/r/tests/testthat/test-dplyr-funcs-datetime.R
@@ -2515,3 +2515,581 @@ test_that("build_formats() and 
build_format_from_order()", {
     )
   )
 })
+
+
+
+# tests for datetime rounding ---------------------------------------------
+
+# an "easy" date to avoid conflating tests of different things (i.e., it's
+# UTC time, and not one of the edge cases on or extremely close to the
+# rounding boundaty)
+easy_date <- as.POSIXct("2022-10-11 12:00:00", tz = "UTC")
+easy_df <- tibble::tibble(datetime = easy_date)
+
+# dates near month boundaries over the course of 1 year
+month_boundaries <- c(
+  "2021-01-01 00:01:00", "2021-02-01 00:01:00", "2021-03-01 00:01:00",
+  "2021-04-01 00:01:00", "2021-05-01 00:01:00", "2021-06-01 00:01:00",
+  "2021-07-01 00:01:00", "2021-08-01 00:01:00", "2021-09-01 00:01:00",
+  "2021-10-01 00:01:00", "2021-11-01 00:01:00", "2021-12-01 00:01:00",
+  "2021-01-31 23:59:00", "2021-02-28 23:59:00", "2021-03-31 23:59:00",
+  "2021-04-30 23:59:00", "2021-05-31 23:59:00", "2021-06-30 23:59:00",
+  "2021-07-31 23:59:00", "2021-08-31 23:59:00", "2021-09-30 23:59:00",
+  "2021-10-31 23:59:00", "2021-11-30 23:59:00", "2021-12-31 23:59:00"
+)
+year_of_dates <- tibble::tibble(
+  datetime = as.POSIXct(month_boundaries, tz = "UTC"),
+  date = as.Date(datetime)
+)
+
+# test case used to check we catch week boundaries for all week_start values
+fortnight <- tibble::tibble(
+  date = seq(
+    from = as.Date("2022-04-04"),
+    to = as.Date("2022-04-17"),
+    by = "day"
+  ),
+  datetime = as.POSIXct(date)
+)
+
+# test case to check we catch interval lower boundaries for ceiling_date
+boundary_times <- tibble::tibble(
+  datetime = as.POSIXct(strptime(c(
+    "2022-05-10 00:00:00", # boundary for week when week_start = 7 (Sunday)
+    "2022-05-11 00:00:00", # boundary for week when week_start = 1 (Monday)
+    "2022-05-12 00:00:00", # boundary for week when week_start = 2 (Tuesday)
+    "2022-03-10 00:00:00", # boundary for day, hour, minute, second, 
millisecond
+    "2022-03-10 00:00:01", # boundary for second, millisecond
+    "2022-03-10 00:01:00", # boundary for second, millisecond, minute
+    "2022-03-10 01:00:00", # boundary for second, millisecond, minute, hour
+    "2022-01-01 00:00:00"  # boundary for year
+  ), tz = "UTC", format = "%F %T")),
+  date = as.Date(datetime)
+)
+
+# test case to check rounding takes place in local time
+datestrings <- c(
+  "1970-01-01 00:00:59.123456789",
+  "2000-02-29 23:23:23.999999999",
+  "1899-01-01 00:59:20.001001001",
+  "2033-05-18 03:33:20.000000000",
+  "2020-01-01 01:05:05.001",
+  "2019-12-31 02:10:10.002",
+  "2019-12-30 03:15:15.003",
+  "2009-12-31 04:20:20.004132",
+  "2010-01-01 05:25:25.005321",
+  "2010-01-03 06:30:30.006163",
+  "2010-01-04 07:35:35",
+  "2006-01-01 08:40:40",
+  "2005-12-31 09:45:45",
+  "2008-12-28 00:00:00",
+  "2008-12-29 00:00:00",
+  "2012-01-01 01:02:03"
+)
+tz_times <- tibble::tibble(
+  utc_time = as.POSIXct(datestrings, tz = "UTC"),
+  syd_time = as.POSIXct(datestrings, tz = "Australia/Sydney"),   # UTC +10   
(UTC +11 with DST)
+  adl_time = as.POSIXct(datestrings, tz = "Australia/Adelaide"), # UTC +9:30 
(UTC +10:30 with DST)
+  mar_time = as.POSIXct(datestrings, tz = "Pacific/Marquesas"),  # UTC -9:30 
(no DST)
+  kat_time = as.POSIXct(datestrings, tz = "Asia/Kathmandu")      # UTC +5:45 
(no DST)
+)
+
+test_that("timestamp round/floor/ceiling works for a minimal test", {
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        round_datetime = round_date(datetime),
+        floor_datetime = floor_date(datetime),
+        ceiling_datetime = ceiling_date(datetime, change_on_boundary = FALSE)
+      ) %>%
+      collect(),
+    test_df
+  )
+})
+
+test_that("timestamp round/floor/ceiling accepts period unit abbreviation", {
+
+  # test helper to ensure standard abbreviations of period names
+  # are understood by arrow and mirror the lubridate behaviour
+  check_period_abbreviation <- function(unit, synonyms) {
+
+    # check arrow against lubridate
+    compare_dplyr_binding(
+      .input %>%
+        mutate(out_1 = round_date(datetime, unit)) %>%
+        collect(),
+      easy_df
+    )
+
+    # check synonyms
+    base <- call_binding("round_date", Expression$scalar(easy_date), unit)
+    for (syn in synonyms) {
+      expect_equal(
+        call_binding("round_date", Expression$scalar(easy_date), syn),
+        base
+      )
+    }
+  }
+
+  check_period_abbreviation("minute", synonyms = c("minutes", "min", "mins"))
+  check_period_abbreviation("second", synonyms = c("seconds", "sec", "secs"))
+  check_period_abbreviation("month", synonyms = c("months", "mon", "mons"))
+})
+
+test_that("temporal round/floor/ceiling accepts periods with multiple units", {
+
+  check_multiple_unit_period <- function(unit, multiplier) {
+    unit_string <- paste(multiplier, unit)
+    compare_dplyr_binding(
+      .input %>%
+        mutate(
+          round_datetime = round_date(datetime, unit_string),
+          floor_datetime = floor_date(datetime, unit_string),
+          ceiling_datetime = ceiling_date(datetime, unit_string)
+        ) %>%
+        collect(),
+      easy_df
+    )
+  }
+
+  for (multiplier in c(1, 2, 10)) {
+    for (unit in c("second", "minute", "day", "year")) {
+      check_multiple_unit_period(unit, multiplier)
+    }
+  }
+})
+
+# Test helper functions for checking equivalence of outputs regardless of
+# the unit specified. The lubridate_unit argument allows for cases where
+# arrow supports a unit name (e.g., nanosecond) that lubridate doesn't. Also
+# note that in the check_date_rounding helper the lubridate output is coerced
+# to ensure type stable output (arrow output should be type stable without 
this)
+
+check_date_rounding <- function(data, unit, lubridate_unit = unit, ...) {
+  expect_equal(
+    data %>%
+      arrow_table() %>%
+      mutate(
+        date_rounded = round_date(date, unit),
+        date_floored = floor_date(date, unit),
+        date_ceiling = ceiling_date(date, unit)
+      ) %>%
+      collect(),
+    data %>%
+      mutate(
+        date_rounded = as.Date(round_date(date, lubridate_unit)),
+        date_floored = as.Date(floor_date(date, lubridate_unit)),
+        date_ceiling = as.Date(ceiling_date(date, lubridate_unit))
+      ),
+    ...
+  )
+}
+
+check_timestamp_rounding <- function(data, unit, lubridate_unit = unit, ...) {
+
+  expect_equal(
+    data %>%
+      arrow_table() %>%
+      mutate(
+        datetime_rounded = round_date(datetime, unit),
+        datetime_floored = floor_date(datetime, unit),
+        datetime_ceiling = ceiling_date(datetime, unit)
+      ) %>%
+      collect(),
+    data %>%
+      mutate(
+        datetime_rounded = round_date(datetime, lubridate_unit),
+        datetime_floored = floor_date(datetime, lubridate_unit),
+        datetime_ceiling = ceiling_date(datetime, lubridate_unit)
+      ),
+    ...
+  )
+}
+
+test_that("date round/floor/ceil works for units of 1 day or less", {
+
+  test_df %>% check_date_rounding("1 millisecond", lubridate_unit = ".001 
second")
+  test_df %>% check_date_rounding("1 day")
+  test_df %>% check_date_rounding("1 second")
+  test_df %>% check_date_rounding("1 hour")
+
+})
+
+test_that("timestamp round/floor/ceil works for units of 1 day or less", {
+
+  test_df %>% check_timestamp_rounding("second")
+  test_df %>% check_timestamp_rounding("minute")
+  test_df %>% check_timestamp_rounding("hour")
+  test_df %>% check_timestamp_rounding("day")
+
+  test_df %>% check_timestamp_rounding(".01 second")
+  test_df %>% check_timestamp_rounding(".001 second")
+  test_df %>% check_timestamp_rounding(".00001 second")
+
+  test_df %>% check_timestamp_rounding("1 millisecond", lubridate_unit = ".001 
second")
+  test_df %>% check_timestamp_rounding("1 microsecond", lubridate_unit = 
".000001 second")
+  test_df %>% check_timestamp_rounding("1 nanosecond", lubridate_unit = 
".000000001 second")
+
+})
+
+test_that("timestamp round/floor/ceil works for units: month/quarter/year", {
+
+  year_of_dates %>% check_timestamp_rounding("month", ignore_attr = TRUE)
+  year_of_dates %>% check_timestamp_rounding("quarter", ignore_attr = TRUE)
+  year_of_dates %>% check_timestamp_rounding("year", ignore_attr = TRUE)
+
+})
+
+# check helper invoked when we need to avoid the lubridate rounding bug
+check_date_rounding_1051_bypass <- function(data, unit, ignore_attr = TRUE, 
...) {
+
+  # directly compare arrow to lubridate for floor and ceiling
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        date_floored = floor_date(date, unit),
+        date_ceiling = ceiling_date(date, unit)
+      ) %>%
+      collect(),
+    data,
+    ignore_attr = ignore_attr,
+    ...
+  )
+
+  # The rounding tests for dates is run against Arrow timestamp behaviour
+  # because of a lubridate bug specific to Date objects with week and
+  # higher-unit rounding (see lubridate issue 1051)
+  # https://github.com/tidyverse/lubridate/issues/1051
+  out <- data %>%
+    arrow_table() %>%
+    mutate(
+      out_date = date %>% round_date(unit), # Date
+      out_time = datetime %>% round_date(unit) # POSIXct
+    ) %>%
+    collect()
+
+  expect_equal(
+    out$out_date,
+    as.Date(out$out_time)
+  )
+}
+
+test_that("date round/floor/ceil works for units: month/quarter/year", {
+
+  # these test cases are affected by lubridate issue 1051 so we bypass
+  # lubridate::round_date() for Date objects with large rounding units
+  # https://github.com/tidyverse/lubridate/issues/1051
+
+  check_date_rounding_1051_bypass(year_of_dates, "month", ignore_attr = TRUE)
+  check_date_rounding_1051_bypass(year_of_dates, "quarter", ignore_attr = TRUE)
+  check_date_rounding_1051_bypass(year_of_dates, "year", ignore_attr = TRUE)
+
+})
+
+check_date_week_rounding <- function(data, week_start, ignore_attr = TRUE, 
...) {
+  expect_equal(
+    data %>%
+      arrow_table() %>%
+      mutate(
+        date_rounded = round_date(date, unit),
+        date_floored = floor_date(date, unit),
+        date_ceiling = ceiling_date(date, unit)
+      ) %>%
+      collect(),
+    data %>%
+      mutate(
+        date_rounded = as.Date(round_date(date, lubridate_unit)),
+        date_floored = as.Date(floor_date(date, lubridate_unit)),
+        date_ceiling = as.Date(ceiling_date(date, lubridate_unit))
+      ),
+    ignore_attr = ignore_attr,
+    ...
+  )
+}
+
+check_timestamp_week_rounding <- function(data, week_start, ignore_attr = 
TRUE, ...) {
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        datetime_rounded = round_date(datetime, "week", week_start = 
week_start),
+        datetime_floored = floor_date(datetime, "week", week_start = 
week_start),
+        datetime_ceiling = ceiling_date(datetime, "week", week_start = 
week_start)
+      ) %>%
+      collect(),
+    data,
+    ignore_attr = ignore_attr,
+    ...
+  )
+}
+
+test_that("timestamp round/floor/ceil works for week units (standard 
week_start)", {
+
+  fortnight %>% check_timestamp_week_rounding(week_start = 1) # Monday
+  fortnight %>% check_timestamp_week_rounding(week_start = 7) # Sunday
+
+})
+
+test_that("timestamp round/floor/ceil works for week units (non-standard 
week_start)", {
+
+  fortnight %>% check_timestamp_week_rounding(week_start = 2) # Tuesday
+  fortnight %>% check_timestamp_week_rounding(week_start = 3) # Wednesday
+  fortnight %>% check_timestamp_week_rounding(week_start = 4) # Thursday
+  fortnight %>% check_timestamp_week_rounding(week_start = 5) # Friday
+  fortnight %>% check_timestamp_week_rounding(week_start = 6) # Saturday
+
+})
+
+check_date_week_rounding <- function(data, week_start, ignore_attr = TRUE, 
...) {
+
+  # directly compare arrow to lubridate for floor and ceiling
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        date_floored = floor_date(date, "week", week_start = week_start),
+        date_ceiling = ceiling_date(date, "week", week_start = week_start)
+      ) %>%
+      collect(),
+    data,
+    ignore_attr = ignore_attr,
+    ...
+  )
+
+  # use the bypass method to avoid the lubridate-1051 bug for week units
+  # https://github.com/tidyverse/lubridate/issues/1051
+  out <- data %>%
+    arrow_table() %>%
+    mutate(
+      out_date = date %>% round_date("week", week_start = week_start), # Date
+      out_time = datetime %>% round_date("week", week_start = week_start) # 
POSIXct
+    ) %>%
+    collect()
+
+  expect_equal(
+    out$out_date,
+    as.Date(out$out_time)
+  )
+}
+
+test_that("date round/floor/ceil works for week units (standard week_start)", {
+
+  check_date_week_rounding(fortnight, week_start = 1) # Monday
+  check_date_week_rounding(fortnight, week_start = 7) # Sunday
+
+})
+
+test_that("date round/floor/ceil works for week units (non-standard 
week_start)", {
+
+  check_date_week_rounding(fortnight, week_start = 2) # Tuesday
+  check_date_week_rounding(fortnight, week_start = 3) # Wednesday
+  check_date_week_rounding(fortnight, week_start = 4) # Thursday
+  check_date_week_rounding(fortnight, week_start = 5) # Friday
+  check_date_week_rounding(fortnight, week_start = 6) # Saturday
+
+})
+
+# Test helper used to check that the change_on_boundary argument to
+# ceiling_date behaves identically to the lubridate version. It takes
+# unit as an argument to run tests separately for different rounding units
+check_boundary_with_unit <- function(unit, ...) {
+
+  # timestamps
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        cob_null = ceiling_date(datetime, unit, change_on_boundary = NULL),
+        cob_true = ceiling_date(datetime, unit, change_on_boundary = TRUE),
+        cob_false = ceiling_date(datetime, unit, change_on_boundary = FALSE)
+      ) %>%
+      collect(),
+    boundary_times,
+    ...
+  )
+
+  # dates
+  expect_equal(
+    boundary_times %>%
+      arrow_table() %>%
+      mutate(
+        cob_null = ceiling_date(date, unit, change_on_boundary = NULL),
+        cob_true = ceiling_date(date, unit, change_on_boundary = TRUE),
+        cob_false = ceiling_date(date, unit, change_on_boundary = FALSE)
+      ) %>%
+      collect(),
+    boundary_times %>%
+      mutate(
+        cob_null = as.Date(ceiling_date(date, unit, change_on_boundary = 
NULL)),
+        cob_true = as.Date(ceiling_date(date, unit, change_on_boundary = 
TRUE)),
+        cob_false = as.Date(ceiling_date(date, unit, change_on_boundary = 
FALSE))
+      ),
+    ...
+  )
+
+
+}
+
+test_that("ceiling_date() applies change_on_boundary correctly", {
+  check_boundary_with_unit(".001 second")
+  check_boundary_with_unit("second")
+  check_boundary_with_unit("minute", tolerance = .001) # floating point issue?
+  check_boundary_with_unit("hour")
+  check_boundary_with_unit("day")
+})
+
+# In lubridate, an error is thrown when 60 sec/60 min/24 hour thresholds are
+# exceeded. Checks that arrow mimics this behaviour and throws an identically
+# worded error message
+test_that("temporal round/floor/ceil period unit maxima are enforced", {
+
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "61 seconds"),
+    "Rounding with second > 60 is not supported"
+  )
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "61 minutes"),
+    "Rounding with minute > 60 is not supported"
+  )
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.time()), "25 hours"),
+    "Rounding with hour > 24 is not supported"
+  )
+  expect_error(
+    call_binding("round_date", Expression$scalar(Sys.Date()), "25 hours"),
+    "Rounding with hour > 24 is not supported"
+  )
+
+})
+
+# one method to test that temporal rounding takes place in local time is to
+# use lubridate as a ground truth and compare arrow results to lubridate
+# results. this test helper runs that test, skipping cases where lubridate
+# produces incorrect answers
+check_timezone_rounding_vs_lubridate <- function(data, unit) {
+
+  # esoteric lubridate bug: on windows only (not ubuntu), lubridate returns
+  # incorrect ceiling/floor for timezoned POSIXct times (syd, adl, kat zones,
+  # but not mar) but not utc, and not for round, and only for these two
+  # timestamps where high-precision timing is relevant to the outcome
+  if (unit %in% c(".001 second", "second", "minute")) {
+    if (tolower(Sys.info()[["sysname"]]) == "windows") {
+      data <- data[-c(1, 3), ]
+    }
+  }
+
+  # external validity check: compare lubridate to arrow
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        utc_floored = floor_date(utc_time, unit = unit),
+        utc_rounded = round_date(utc_time, unit = unit),
+        utc_ceiling = ceiling_date(utc_time, unit = unit),
+        syd_floored = floor_date(syd_time, unit = unit),
+        syd_rounded = round_date(syd_time, unit = unit),
+        syd_ceiling = ceiling_date(syd_time, unit = unit),
+        adl_floored = floor_date(adl_time, unit = unit),
+        adl_rounded = round_date(adl_time, unit = unit),
+        adl_ceiling = ceiling_date(adl_time, unit = unit),
+        mar_floored = floor_date(mar_time, unit = unit),
+        mar_rounded = round_date(mar_time, unit = unit),
+        mar_ceiling = ceiling_date(mar_time, unit = unit),
+        kat_floored = floor_date(kat_time, unit = unit),
+        kat_rounded = round_date(kat_time, unit = unit),
+        kat_ceiling = ceiling_date(kat_time, unit = unit)
+      ) %>%
+      collect(),
+    data
+  )
+
+}
+
+# another method to check that temporal rounding takes place in local
+# time is to test the internal consistency of the YMD HMS values returned
+# by temporal rounding functions: these should be the same regardless of
+# timezone and should always be identical to the equivalent result calculated
+# for UTC test. this test isn't useful for subsecond resolution but avoids
+# dependency on lubridate
+check_timezone_rounding_for_consistency <- function(data, unit) {
+
+  shifted_times <- data %>%
+    arrow_table() %>%
+    mutate(
+      utc_floored = floor_date(utc_time, unit = unit),
+      utc_rounded = round_date(utc_time, unit = unit),
+      utc_ceiling = ceiling_date(utc_time, unit = unit),
+      syd_floored = floor_date(syd_time, unit = unit),
+      syd_rounded = round_date(syd_time, unit = unit),
+      syd_ceiling = ceiling_date(syd_time, unit = unit),
+      adl_floored = floor_date(adl_time, unit = unit),
+      adl_rounded = round_date(adl_time, unit = unit),
+      adl_ceiling = ceiling_date(adl_time, unit = unit),
+      mar_floored = floor_date(mar_time, unit = unit),
+      mar_rounded = round_date(mar_time, unit = unit),
+      mar_ceiling = ceiling_date(mar_time, unit = unit),
+      kat_floored = floor_date(kat_time, unit = unit),
+      kat_rounded = round_date(kat_time, unit = unit),
+      kat_ceiling = ceiling_date(kat_time, unit = unit)
+    ) %>%
+    collect()
+
+  compare_local_times <- function(time1, time2) {
+    all(year(time1) == year(time1) &
+          month(time1) == month(time2) &
+          day(time1) == day(time2) &
+          hour(time1) == hour(time2) &
+          minute(time1) == minute(time2) &
+          second(time1) == second(time1))
+  }
+
+  base <- shifted_times$utc_rounded
+  expect_true(compare_local_times(shifted_times$syd_rounded, base))
+  expect_true(compare_local_times(shifted_times$adl_rounded, base))
+  expect_true(compare_local_times(shifted_times$mar_rounded, base))
+  expect_true(compare_local_times(shifted_times$kat_rounded, base))
+
+  base <- shifted_times$utc_floored
+  expect_true(compare_local_times(shifted_times$syd_floored, base))
+  expect_true(compare_local_times(shifted_times$adl_floored, base))
+  expect_true(compare_local_times(shifted_times$mar_floored, base))
+  expect_true(compare_local_times(shifted_times$kat_floored, base))
+
+  base <- shifted_times$utc_ceiling
+  expect_true(compare_local_times(shifted_times$syd_ceiling, base))
+  expect_true(compare_local_times(shifted_times$adl_ceiling, base))
+  expect_true(compare_local_times(shifted_times$mar_ceiling, base))
+  expect_true(compare_local_times(shifted_times$kat_ceiling, base))
+}
+
+test_that("timestamp rounding takes place in local time", {
+
+  tz_times %>% check_timezone_rounding_vs_lubridate(".001 second")
+  tz_times %>% check_timezone_rounding_vs_lubridate("second")
+  tz_times %>% check_timezone_rounding_vs_lubridate("minute")
+  tz_times %>% check_timezone_rounding_vs_lubridate("hour")
+  tz_times %>% check_timezone_rounding_vs_lubridate("day")
+  tz_times %>% check_timezone_rounding_vs_lubridate("week")
+  tz_times %>% check_timezone_rounding_vs_lubridate("month")
+  tz_times %>% check_timezone_rounding_vs_lubridate("quarter")
+  tz_times %>% check_timezone_rounding_vs_lubridate("year")
+
+  tz_times %>% check_timezone_rounding_for_consistency("second")
+  tz_times %>% check_timezone_rounding_for_consistency("minute")
+  tz_times %>% check_timezone_rounding_for_consistency("hour")
+  tz_times %>% check_timezone_rounding_for_consistency("day")
+  tz_times %>% check_timezone_rounding_for_consistency("week")
+  tz_times %>% check_timezone_rounding_for_consistency("month")
+  tz_times %>% check_timezone_rounding_for_consistency("quarter")
+  tz_times %>% check_timezone_rounding_for_consistency("year")
+
+  tz_times %>% check_timezone_rounding_for_consistency("7 seconds")
+  tz_times %>% check_timezone_rounding_for_consistency("7 minutes")
+  tz_times %>% check_timezone_rounding_for_consistency("7 hours")
+  tz_times %>% check_timezone_rounding_for_consistency("7 months")
+  tz_times %>% check_timezone_rounding_for_consistency("7 years")
+
+  tz_times %>% check_timezone_rounding_for_consistency("13 seconds")
+  tz_times %>% check_timezone_rounding_for_consistency("13 minutes")
+  tz_times %>% check_timezone_rounding_for_consistency("13 hours")
+  tz_times %>% check_timezone_rounding_for_consistency("13 months")
+  tz_times %>% check_timezone_rounding_for_consistency("13 years")
+
+})

Reply via email to