This is an automated email from the ASF dual-hosted git repository.
npr pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new b0734e68d6 ARROW-14821: [R] Implement bindings for lubridate's
floor_date, ceiling_date, and round_date (#12154)
b0734e68d6 is described below
commit b0734e68d6f57fb22869df0d0aa2ae4dd75765dc
Author: Danielle Navarro <[email protected]>
AuthorDate: Fri Jul 22 08:31:02 2022 +1000
ARROW-14821: [R] Implement bindings for lubridate's floor_date,
ceiling_date, and round_date (#12154)
This patch provides dplyr bindings to for lubridate functions
`floor_date()`, `ceiling_date()`, and `round_date()`. This is my first attempt
at writing a patch, so my apologies if I've made any errors 🙂
### Supported functionality:
- Allows rounding to integer multiples of common time units (second,
minutes, days, etc)
- Mirrors the lubridate syntax allowing fractional units such as `unit =
.001 seconds` as an alias for `unit = 1 millisecond`
- Allows partial matching of date units based on first three characters:
e.g. `sec`, `second`, `seconds` all match `second`
- Mirrors lubridate in throwing errors when unit exceeds thresholds: 60
seconds, 60 minutes, 24 hours
~~### Major problems not yet addressed:~~
~~- Does not yet support the `week_start` argument, and implicitly fixes
`week_start = 4`~~
~~- Does not yet mirror lubridate handling of timezones~~
~~I'd prefer to fix these two issues before merging, but I'm uncertain how
best to handle them. Any advice would be appreciated!~~
~~### Minor things not yet addressed~~
~~- During rounding lubridate sometimes coerces Date objects to POSIXct.
This is not mirrored in the arrow bindings: date32 classes remain date32
classes. This introduces minor differences in rounding in some cases~~
~~- Does not yet support the `change_on_boundary` argument to
`ceiling_date()`. It's a small discrepancy, but it means that the default
behaviour of the arrow dplyr binding mirrors lubridate prior to v1.6.0~~
EDIT: issues now addressed!
Authored-by: Danielle Navarro <[email protected]>
Signed-off-by: Neal Richardson <[email protected]>
---
r/R/dplyr-datetime-helpers.R | 158 ++++++++
r/R/dplyr-funcs-datetime.R | 52 +++
r/tests/testthat/test-dplyr-funcs-datetime.R | 578 +++++++++++++++++++++++++++
3 files changed, 788 insertions(+)
diff --git a/r/R/dplyr-datetime-helpers.R b/r/R/dplyr-datetime-helpers.R
index 9199ce0dd5..efcc62ff4e 100644
--- a/r/R/dplyr-datetime-helpers.R
+++ b/r/R/dplyr-datetime-helpers.R
@@ -417,3 +417,161 @@ build_strptime_exprs <- function(x, formats) {
)
)
}
+
+# This function parses the "unit" argument to round_date, floor_date, and
+# ceiling_date. The input x is a single string like "second", "3 seconds",
+# "10 microseconds" or "2 secs" used to specify the size of the unit to
+# which the temporal data should be rounded. The matching rules implemented
+# are designed to mirror lubridate exactly: it extracts the numeric multiple
+# from the start of the string (presumed to be 1 if no number is present)
+# and selects the unit by looking at the first 3 characters only. This choice
+# ensures that "secs", "second", "microsecs" etc are all valid, but it is
+# very permissive and would interpret "mickeys" as microseconds. This
+# permissive implementation mirrors the corresponding implementation in
+# lubridate. The return value is a list with integer-valued components
+# "multiple" and "unit"
+parse_period_unit <- function(x) {
+ # the regexp matches against fractional units, but per lubridate
+ # supports integer multiples of a known unit only
+ match_info <- regexpr(
+ pattern = " *(?<multiple>[0-9.,]+)? *(?<unit>[^ \t\n]+)",
+ text = x[[1]],
+ perl = TRUE
+ )
+
+ capture_start <- attr(match_info, "capture.start")
+ capture_length <- attr(match_info, "capture.length")
+ capture_end <- capture_start + capture_length - 1L
+
+ str_unit <- substr(x, capture_start[[2]], capture_end[[2]])
+ str_multiple <- substr(x, capture_start[[1]], capture_end[[1]])
+
+ known_units <- c("nanosecond", "microsecond", "millisecond", "second",
+ "minute", "hour", "day", "week", "month", "quarter", "year")
+
+ # match the period unit
+ str_unit_start <- substr(str_unit, 1, 3)
+ unit <- as.integer(pmatch(str_unit_start, known_units)) - 1L
+
+ if (any(is.na(unit))) {
+ abort(
+ sprintf(
+ "Invalid period name: '%s'",
+ str_unit,
+ ". Known units are",
+ oxford_paste(known_units, "and")
+ )
+ )
+ }
+
+ # empty string in multiple interpreted as 1
+ if (capture_length[[1]] == 0) {
+ multiple <- 1L
+
+ # otherwise parse the multiple
+ } else {
+ multiple <- as.numeric(str_multiple)
+
+ # special cases: interpret fractions of 1 second as integer
+ # multiples of nanoseconds, microseconds, or milliseconds
+ # to mirror lubridate syntax
+ if (unit == 3L) {
+ if (multiple < 10^-6) {
+ unit <- 0L
+ multiple <- 10^9 * multiple
+ }
+ if (multiple < 10^-3) {
+ unit <- 1L
+ multiple <- 10^6 * multiple
+ }
+ if (multiple < 1) {
+ unit <- 2L
+ multiple <- 10^3 * multiple
+ }
+ }
+
+ multiple <- as.integer(multiple)
+ }
+
+ # more special cases: lubridate imposes sensible maximum
+ # values on the number of seconds, minutes and hours
+ if (unit == 3L && multiple > 60) {
+ abort("Rounding with second > 60 is not supported")
+ }
+ if (unit == 4L && multiple > 60) {
+ abort("Rounding with minute > 60 is not supported")
+ }
+ if (unit == 5L && multiple > 24) {
+ abort("Rounding with hour > 24 is not supported")
+ }
+
+ list(unit = unit, multiple = multiple)
+}
+
+# This function handles round/ceil/floor when unit is week. The fn argument
+# specifies which of the temporal rounding functions (round_date, etc) is to
+# be applied, x is the data argument to the rounding function, week_start is
+# an integer indicating which day of the week is the start date. The C++
+# library natively handles Sunday and Monday so in those cases we pass the
+# week_starts_monday option through. Other week_start values are handled here
+shift_temporal_to_week <- function(fn, x, week_start, options) {
+ if (week_start == 7) { # Sunday
+ options$week_starts_monday <- FALSE
+ return(Expression$create(fn, x, options = options))
+ }
+
+ if (week_start == 1) { # Monday
+ options$week_starts_monday <- TRUE
+ return(Expression$create(fn, x, options = options))
+ }
+
+ # other cases use offset-from-Monday: to ensure type-stable output there
+ # are two separate helpers, one to handle date32 input and the other to
+ # handle timestamps
+ options$week_starts_monday <- TRUE
+ offset <- as.integer(week_start) - 1
+
+ is_date32 <- inherits(x, "Date") ||
+ (inherits(x, "Expression") && x$type_id() == Type$DATE32)
+
+ if (is_date32) {
+ shifted_date <- shift_date32_to_week(fn, x, offset, options = options)
+ } else {
+ shifted_date <- shift_timestamp_to_week(fn, x, offset, options = options)
+ }
+
+ shifted_date
+}
+
+# timestamp input should remain timestamp
+shift_timestamp_to_week <- function(fn, x, offset, options) {
+ offset_seconds <- build_expr(
+ "cast",
+ Scalar$create(offset * 86400L, int64()),
+ options = cast_options(to_type = duration(unit = "s"))
+ )
+ shift_offset <- build_expr(fn, x - offset_seconds, options = options)
+
+ shift_offset + offset_seconds
+}
+
+# to avoid date32 types being cast to timestamp during the temporal
+# arithmetic, the offset logic needs to use the count in days and
+# use integer arithmetic: this feels inelegant, but it ensures that
+# temporal rounding functions remain type stable
+shift_date32_to_week <- function(fn, x, offset, options) {
+ # offset the date
+ offset <- Expression$scalar(Scalar$create(offset, int32()))
+ x_int <- build_expr("cast", x, options = cast_options(to_type = int32()))
+ x_int_offset <- x_int - offset
+ x_offset <- build_expr("cast", x_int_offset, options = cast_options(to_type
= date32()))
+
+ # apply round/floor/ceil
+ shift_offset <- build_expr(fn, x_offset, options = options)
+
+ # undo offset and return
+ shift_int_offset <- build_expr("cast", shift_offset, options =
cast_options(to_type = int32()))
+ shift_int <- shift_int_offset + offset
+
+ build_expr("cast", shift_int, options = cast_options(to_type = date32()))
+}
diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R
index 1db6c647d5..fd705e4578 100644
--- a/r/R/dplyr-funcs-datetime.R
+++ b/r/R/dplyr-funcs-datetime.R
@@ -25,6 +25,7 @@ register_bindings_datetime <- function() {
register_bindings_duration_constructor()
register_bindings_duration_helpers()
register_bindings_datetime_parsers()
+ register_bindings_datetime_rounding()
}
register_bindings_datetime_utility <- function() {
@@ -637,4 +638,55 @@ register_bindings_datetime_parsers <- function() {
build_expr("assume_timezone", coalesce_output, options = list(timezone =
tz))
})
+
+}
+
+register_bindings_datetime_rounding <- function() {
+ register_binding(
+ "round_date",
+ function(x,
+ unit = "second",
+ week_start = getOption("lubridate.week.start", 7)) {
+
+ opts <- parse_period_unit(unit)
+ if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start
+ return(shift_temporal_to_week("round_temporal", x, week_start, options =
opts))
+ }
+
+ Expression$create("round_temporal", x, options = opts)
+ })
+
+ register_binding(
+ "floor_date",
+ function(x,
+ unit = "second",
+ week_start = getOption("lubridate.week.start", 7)) {
+
+ opts <- parse_period_unit(unit)
+ if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start
+ return(shift_temporal_to_week("floor_temporal", x, week_start, options =
opts))
+ }
+
+ Expression$create("floor_temporal", x, options = opts)
+ })
+
+ register_binding(
+ "ceiling_date",
+ function(x,
+ unit = "second",
+ change_on_boundary = NULL,
+ week_start = getOption("lubridate.week.start", 7)) {
+ opts <- parse_period_unit(unit)
+ if (is.null(change_on_boundary)) {
+ change_on_boundary <- ifelse(call_binding("is.Date", x), TRUE, FALSE)
+ }
+ opts$ceil_is_strictly_greater <- change_on_boundary
+
+ if (opts$unit == 7L) { # weeks (unit = 7L) need to accommodate week_start
+ return(shift_temporal_to_week("ceil_temporal", x, week_start, options =
opts))
+ }
+
+ Expression$create("ceil_temporal", x, options = opts)
+ })
+
}
diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R
b/r/tests/testthat/test-dplyr-funcs-datetime.R
index 6caf061fc8..1c6b73d7af 100644
--- a/r/tests/testthat/test-dplyr-funcs-datetime.R
+++ b/r/tests/testthat/test-dplyr-funcs-datetime.R
@@ -2515,3 +2515,581 @@ test_that("build_formats() and
build_format_from_order()", {
)
)
})
+
+
+
+# tests for datetime rounding ---------------------------------------------
+
+# an "easy" date to avoid conflating tests of different things (i.e., it's
+# UTC time, and not one of the edge cases on or extremely close to the
+# rounding boundaty)
+easy_date <- as.POSIXct("2022-10-11 12:00:00", tz = "UTC")
+easy_df <- tibble::tibble(datetime = easy_date)
+
+# dates near month boundaries over the course of 1 year
+month_boundaries <- c(
+ "2021-01-01 00:01:00", "2021-02-01 00:01:00", "2021-03-01 00:01:00",
+ "2021-04-01 00:01:00", "2021-05-01 00:01:00", "2021-06-01 00:01:00",
+ "2021-07-01 00:01:00", "2021-08-01 00:01:00", "2021-09-01 00:01:00",
+ "2021-10-01 00:01:00", "2021-11-01 00:01:00", "2021-12-01 00:01:00",
+ "2021-01-31 23:59:00", "2021-02-28 23:59:00", "2021-03-31 23:59:00",
+ "2021-04-30 23:59:00", "2021-05-31 23:59:00", "2021-06-30 23:59:00",
+ "2021-07-31 23:59:00", "2021-08-31 23:59:00", "2021-09-30 23:59:00",
+ "2021-10-31 23:59:00", "2021-11-30 23:59:00", "2021-12-31 23:59:00"
+)
+year_of_dates <- tibble::tibble(
+ datetime = as.POSIXct(month_boundaries, tz = "UTC"),
+ date = as.Date(datetime)
+)
+
+# test case used to check we catch week boundaries for all week_start values
+fortnight <- tibble::tibble(
+ date = seq(
+ from = as.Date("2022-04-04"),
+ to = as.Date("2022-04-17"),
+ by = "day"
+ ),
+ datetime = as.POSIXct(date)
+)
+
+# test case to check we catch interval lower boundaries for ceiling_date
+boundary_times <- tibble::tibble(
+ datetime = as.POSIXct(strptime(c(
+ "2022-05-10 00:00:00", # boundary for week when week_start = 7 (Sunday)
+ "2022-05-11 00:00:00", # boundary for week when week_start = 1 (Monday)
+ "2022-05-12 00:00:00", # boundary for week when week_start = 2 (Tuesday)
+ "2022-03-10 00:00:00", # boundary for day, hour, minute, second,
millisecond
+ "2022-03-10 00:00:01", # boundary for second, millisecond
+ "2022-03-10 00:01:00", # boundary for second, millisecond, minute
+ "2022-03-10 01:00:00", # boundary for second, millisecond, minute, hour
+ "2022-01-01 00:00:00" # boundary for year
+ ), tz = "UTC", format = "%F %T")),
+ date = as.Date(datetime)
+)
+
+# test case to check rounding takes place in local time
+datestrings <- c(
+ "1970-01-01 00:00:59.123456789",
+ "2000-02-29 23:23:23.999999999",
+ "1899-01-01 00:59:20.001001001",
+ "2033-05-18 03:33:20.000000000",
+ "2020-01-01 01:05:05.001",
+ "2019-12-31 02:10:10.002",
+ "2019-12-30 03:15:15.003",
+ "2009-12-31 04:20:20.004132",
+ "2010-01-01 05:25:25.005321",
+ "2010-01-03 06:30:30.006163",
+ "2010-01-04 07:35:35",
+ "2006-01-01 08:40:40",
+ "2005-12-31 09:45:45",
+ "2008-12-28 00:00:00",
+ "2008-12-29 00:00:00",
+ "2012-01-01 01:02:03"
+)
+tz_times <- tibble::tibble(
+ utc_time = as.POSIXct(datestrings, tz = "UTC"),
+ syd_time = as.POSIXct(datestrings, tz = "Australia/Sydney"), # UTC +10
(UTC +11 with DST)
+ adl_time = as.POSIXct(datestrings, tz = "Australia/Adelaide"), # UTC +9:30
(UTC +10:30 with DST)
+ mar_time = as.POSIXct(datestrings, tz = "Pacific/Marquesas"), # UTC -9:30
(no DST)
+ kat_time = as.POSIXct(datestrings, tz = "Asia/Kathmandu") # UTC +5:45
(no DST)
+)
+
+test_that("timestamp round/floor/ceiling works for a minimal test", {
+
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ round_datetime = round_date(datetime),
+ floor_datetime = floor_date(datetime),
+ ceiling_datetime = ceiling_date(datetime, change_on_boundary = FALSE)
+ ) %>%
+ collect(),
+ test_df
+ )
+})
+
+test_that("timestamp round/floor/ceiling accepts period unit abbreviation", {
+
+ # test helper to ensure standard abbreviations of period names
+ # are understood by arrow and mirror the lubridate behaviour
+ check_period_abbreviation <- function(unit, synonyms) {
+
+ # check arrow against lubridate
+ compare_dplyr_binding(
+ .input %>%
+ mutate(out_1 = round_date(datetime, unit)) %>%
+ collect(),
+ easy_df
+ )
+
+ # check synonyms
+ base <- call_binding("round_date", Expression$scalar(easy_date), unit)
+ for (syn in synonyms) {
+ expect_equal(
+ call_binding("round_date", Expression$scalar(easy_date), syn),
+ base
+ )
+ }
+ }
+
+ check_period_abbreviation("minute", synonyms = c("minutes", "min", "mins"))
+ check_period_abbreviation("second", synonyms = c("seconds", "sec", "secs"))
+ check_period_abbreviation("month", synonyms = c("months", "mon", "mons"))
+})
+
+test_that("temporal round/floor/ceiling accepts periods with multiple units", {
+
+ check_multiple_unit_period <- function(unit, multiplier) {
+ unit_string <- paste(multiplier, unit)
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ round_datetime = round_date(datetime, unit_string),
+ floor_datetime = floor_date(datetime, unit_string),
+ ceiling_datetime = ceiling_date(datetime, unit_string)
+ ) %>%
+ collect(),
+ easy_df
+ )
+ }
+
+ for (multiplier in c(1, 2, 10)) {
+ for (unit in c("second", "minute", "day", "year")) {
+ check_multiple_unit_period(unit, multiplier)
+ }
+ }
+})
+
+# Test helper functions for checking equivalence of outputs regardless of
+# the unit specified. The lubridate_unit argument allows for cases where
+# arrow supports a unit name (e.g., nanosecond) that lubridate doesn't. Also
+# note that in the check_date_rounding helper the lubridate output is coerced
+# to ensure type stable output (arrow output should be type stable without
this)
+
+check_date_rounding <- function(data, unit, lubridate_unit = unit, ...) {
+ expect_equal(
+ data %>%
+ arrow_table() %>%
+ mutate(
+ date_rounded = round_date(date, unit),
+ date_floored = floor_date(date, unit),
+ date_ceiling = ceiling_date(date, unit)
+ ) %>%
+ collect(),
+ data %>%
+ mutate(
+ date_rounded = as.Date(round_date(date, lubridate_unit)),
+ date_floored = as.Date(floor_date(date, lubridate_unit)),
+ date_ceiling = as.Date(ceiling_date(date, lubridate_unit))
+ ),
+ ...
+ )
+}
+
+check_timestamp_rounding <- function(data, unit, lubridate_unit = unit, ...) {
+
+ expect_equal(
+ data %>%
+ arrow_table() %>%
+ mutate(
+ datetime_rounded = round_date(datetime, unit),
+ datetime_floored = floor_date(datetime, unit),
+ datetime_ceiling = ceiling_date(datetime, unit)
+ ) %>%
+ collect(),
+ data %>%
+ mutate(
+ datetime_rounded = round_date(datetime, lubridate_unit),
+ datetime_floored = floor_date(datetime, lubridate_unit),
+ datetime_ceiling = ceiling_date(datetime, lubridate_unit)
+ ),
+ ...
+ )
+}
+
+test_that("date round/floor/ceil works for units of 1 day or less", {
+
+ test_df %>% check_date_rounding("1 millisecond", lubridate_unit = ".001
second")
+ test_df %>% check_date_rounding("1 day")
+ test_df %>% check_date_rounding("1 second")
+ test_df %>% check_date_rounding("1 hour")
+
+})
+
+test_that("timestamp round/floor/ceil works for units of 1 day or less", {
+
+ test_df %>% check_timestamp_rounding("second")
+ test_df %>% check_timestamp_rounding("minute")
+ test_df %>% check_timestamp_rounding("hour")
+ test_df %>% check_timestamp_rounding("day")
+
+ test_df %>% check_timestamp_rounding(".01 second")
+ test_df %>% check_timestamp_rounding(".001 second")
+ test_df %>% check_timestamp_rounding(".00001 second")
+
+ test_df %>% check_timestamp_rounding("1 millisecond", lubridate_unit = ".001
second")
+ test_df %>% check_timestamp_rounding("1 microsecond", lubridate_unit =
".000001 second")
+ test_df %>% check_timestamp_rounding("1 nanosecond", lubridate_unit =
".000000001 second")
+
+})
+
+test_that("timestamp round/floor/ceil works for units: month/quarter/year", {
+
+ year_of_dates %>% check_timestamp_rounding("month", ignore_attr = TRUE)
+ year_of_dates %>% check_timestamp_rounding("quarter", ignore_attr = TRUE)
+ year_of_dates %>% check_timestamp_rounding("year", ignore_attr = TRUE)
+
+})
+
+# check helper invoked when we need to avoid the lubridate rounding bug
+check_date_rounding_1051_bypass <- function(data, unit, ignore_attr = TRUE,
...) {
+
+ # directly compare arrow to lubridate for floor and ceiling
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ date_floored = floor_date(date, unit),
+ date_ceiling = ceiling_date(date, unit)
+ ) %>%
+ collect(),
+ data,
+ ignore_attr = ignore_attr,
+ ...
+ )
+
+ # The rounding tests for dates is run against Arrow timestamp behaviour
+ # because of a lubridate bug specific to Date objects with week and
+ # higher-unit rounding (see lubridate issue 1051)
+ # https://github.com/tidyverse/lubridate/issues/1051
+ out <- data %>%
+ arrow_table() %>%
+ mutate(
+ out_date = date %>% round_date(unit), # Date
+ out_time = datetime %>% round_date(unit) # POSIXct
+ ) %>%
+ collect()
+
+ expect_equal(
+ out$out_date,
+ as.Date(out$out_time)
+ )
+}
+
+test_that("date round/floor/ceil works for units: month/quarter/year", {
+
+ # these test cases are affected by lubridate issue 1051 so we bypass
+ # lubridate::round_date() for Date objects with large rounding units
+ # https://github.com/tidyverse/lubridate/issues/1051
+
+ check_date_rounding_1051_bypass(year_of_dates, "month", ignore_attr = TRUE)
+ check_date_rounding_1051_bypass(year_of_dates, "quarter", ignore_attr = TRUE)
+ check_date_rounding_1051_bypass(year_of_dates, "year", ignore_attr = TRUE)
+
+})
+
+check_date_week_rounding <- function(data, week_start, ignore_attr = TRUE,
...) {
+ expect_equal(
+ data %>%
+ arrow_table() %>%
+ mutate(
+ date_rounded = round_date(date, unit),
+ date_floored = floor_date(date, unit),
+ date_ceiling = ceiling_date(date, unit)
+ ) %>%
+ collect(),
+ data %>%
+ mutate(
+ date_rounded = as.Date(round_date(date, lubridate_unit)),
+ date_floored = as.Date(floor_date(date, lubridate_unit)),
+ date_ceiling = as.Date(ceiling_date(date, lubridate_unit))
+ ),
+ ignore_attr = ignore_attr,
+ ...
+ )
+}
+
+check_timestamp_week_rounding <- function(data, week_start, ignore_attr =
TRUE, ...) {
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ datetime_rounded = round_date(datetime, "week", week_start =
week_start),
+ datetime_floored = floor_date(datetime, "week", week_start =
week_start),
+ datetime_ceiling = ceiling_date(datetime, "week", week_start =
week_start)
+ ) %>%
+ collect(),
+ data,
+ ignore_attr = ignore_attr,
+ ...
+ )
+}
+
+test_that("timestamp round/floor/ceil works for week units (standard
week_start)", {
+
+ fortnight %>% check_timestamp_week_rounding(week_start = 1) # Monday
+ fortnight %>% check_timestamp_week_rounding(week_start = 7) # Sunday
+
+})
+
+test_that("timestamp round/floor/ceil works for week units (non-standard
week_start)", {
+
+ fortnight %>% check_timestamp_week_rounding(week_start = 2) # Tuesday
+ fortnight %>% check_timestamp_week_rounding(week_start = 3) # Wednesday
+ fortnight %>% check_timestamp_week_rounding(week_start = 4) # Thursday
+ fortnight %>% check_timestamp_week_rounding(week_start = 5) # Friday
+ fortnight %>% check_timestamp_week_rounding(week_start = 6) # Saturday
+
+})
+
+check_date_week_rounding <- function(data, week_start, ignore_attr = TRUE,
...) {
+
+ # directly compare arrow to lubridate for floor and ceiling
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ date_floored = floor_date(date, "week", week_start = week_start),
+ date_ceiling = ceiling_date(date, "week", week_start = week_start)
+ ) %>%
+ collect(),
+ data,
+ ignore_attr = ignore_attr,
+ ...
+ )
+
+ # use the bypass method to avoid the lubridate-1051 bug for week units
+ # https://github.com/tidyverse/lubridate/issues/1051
+ out <- data %>%
+ arrow_table() %>%
+ mutate(
+ out_date = date %>% round_date("week", week_start = week_start), # Date
+ out_time = datetime %>% round_date("week", week_start = week_start) #
POSIXct
+ ) %>%
+ collect()
+
+ expect_equal(
+ out$out_date,
+ as.Date(out$out_time)
+ )
+}
+
+test_that("date round/floor/ceil works for week units (standard week_start)", {
+
+ check_date_week_rounding(fortnight, week_start = 1) # Monday
+ check_date_week_rounding(fortnight, week_start = 7) # Sunday
+
+})
+
+test_that("date round/floor/ceil works for week units (non-standard
week_start)", {
+
+ check_date_week_rounding(fortnight, week_start = 2) # Tuesday
+ check_date_week_rounding(fortnight, week_start = 3) # Wednesday
+ check_date_week_rounding(fortnight, week_start = 4) # Thursday
+ check_date_week_rounding(fortnight, week_start = 5) # Friday
+ check_date_week_rounding(fortnight, week_start = 6) # Saturday
+
+})
+
+# Test helper used to check that the change_on_boundary argument to
+# ceiling_date behaves identically to the lubridate version. It takes
+# unit as an argument to run tests separately for different rounding units
+check_boundary_with_unit <- function(unit, ...) {
+
+ # timestamps
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ cob_null = ceiling_date(datetime, unit, change_on_boundary = NULL),
+ cob_true = ceiling_date(datetime, unit, change_on_boundary = TRUE),
+ cob_false = ceiling_date(datetime, unit, change_on_boundary = FALSE)
+ ) %>%
+ collect(),
+ boundary_times,
+ ...
+ )
+
+ # dates
+ expect_equal(
+ boundary_times %>%
+ arrow_table() %>%
+ mutate(
+ cob_null = ceiling_date(date, unit, change_on_boundary = NULL),
+ cob_true = ceiling_date(date, unit, change_on_boundary = TRUE),
+ cob_false = ceiling_date(date, unit, change_on_boundary = FALSE)
+ ) %>%
+ collect(),
+ boundary_times %>%
+ mutate(
+ cob_null = as.Date(ceiling_date(date, unit, change_on_boundary =
NULL)),
+ cob_true = as.Date(ceiling_date(date, unit, change_on_boundary =
TRUE)),
+ cob_false = as.Date(ceiling_date(date, unit, change_on_boundary =
FALSE))
+ ),
+ ...
+ )
+
+
+}
+
+test_that("ceiling_date() applies change_on_boundary correctly", {
+ check_boundary_with_unit(".001 second")
+ check_boundary_with_unit("second")
+ check_boundary_with_unit("minute", tolerance = .001) # floating point issue?
+ check_boundary_with_unit("hour")
+ check_boundary_with_unit("day")
+})
+
+# In lubridate, an error is thrown when 60 sec/60 min/24 hour thresholds are
+# exceeded. Checks that arrow mimics this behaviour and throws an identically
+# worded error message
+test_that("temporal round/floor/ceil period unit maxima are enforced", {
+
+ expect_error(
+ call_binding("round_date", Expression$scalar(Sys.time()), "61 seconds"),
+ "Rounding with second > 60 is not supported"
+ )
+ expect_error(
+ call_binding("round_date", Expression$scalar(Sys.time()), "61 minutes"),
+ "Rounding with minute > 60 is not supported"
+ )
+ expect_error(
+ call_binding("round_date", Expression$scalar(Sys.time()), "25 hours"),
+ "Rounding with hour > 24 is not supported"
+ )
+ expect_error(
+ call_binding("round_date", Expression$scalar(Sys.Date()), "25 hours"),
+ "Rounding with hour > 24 is not supported"
+ )
+
+})
+
+# one method to test that temporal rounding takes place in local time is to
+# use lubridate as a ground truth and compare arrow results to lubridate
+# results. this test helper runs that test, skipping cases where lubridate
+# produces incorrect answers
+check_timezone_rounding_vs_lubridate <- function(data, unit) {
+
+ # esoteric lubridate bug: on windows only (not ubuntu), lubridate returns
+ # incorrect ceiling/floor for timezoned POSIXct times (syd, adl, kat zones,
+ # but not mar) but not utc, and not for round, and only for these two
+ # timestamps where high-precision timing is relevant to the outcome
+ if (unit %in% c(".001 second", "second", "minute")) {
+ if (tolower(Sys.info()[["sysname"]]) == "windows") {
+ data <- data[-c(1, 3), ]
+ }
+ }
+
+ # external validity check: compare lubridate to arrow
+ compare_dplyr_binding(
+ .input %>%
+ mutate(
+ utc_floored = floor_date(utc_time, unit = unit),
+ utc_rounded = round_date(utc_time, unit = unit),
+ utc_ceiling = ceiling_date(utc_time, unit = unit),
+ syd_floored = floor_date(syd_time, unit = unit),
+ syd_rounded = round_date(syd_time, unit = unit),
+ syd_ceiling = ceiling_date(syd_time, unit = unit),
+ adl_floored = floor_date(adl_time, unit = unit),
+ adl_rounded = round_date(adl_time, unit = unit),
+ adl_ceiling = ceiling_date(adl_time, unit = unit),
+ mar_floored = floor_date(mar_time, unit = unit),
+ mar_rounded = round_date(mar_time, unit = unit),
+ mar_ceiling = ceiling_date(mar_time, unit = unit),
+ kat_floored = floor_date(kat_time, unit = unit),
+ kat_rounded = round_date(kat_time, unit = unit),
+ kat_ceiling = ceiling_date(kat_time, unit = unit)
+ ) %>%
+ collect(),
+ data
+ )
+
+}
+
+# another method to check that temporal rounding takes place in local
+# time is to test the internal consistency of the YMD HMS values returned
+# by temporal rounding functions: these should be the same regardless of
+# timezone and should always be identical to the equivalent result calculated
+# for UTC test. this test isn't useful for subsecond resolution but avoids
+# dependency on lubridate
+check_timezone_rounding_for_consistency <- function(data, unit) {
+
+ shifted_times <- data %>%
+ arrow_table() %>%
+ mutate(
+ utc_floored = floor_date(utc_time, unit = unit),
+ utc_rounded = round_date(utc_time, unit = unit),
+ utc_ceiling = ceiling_date(utc_time, unit = unit),
+ syd_floored = floor_date(syd_time, unit = unit),
+ syd_rounded = round_date(syd_time, unit = unit),
+ syd_ceiling = ceiling_date(syd_time, unit = unit),
+ adl_floored = floor_date(adl_time, unit = unit),
+ adl_rounded = round_date(adl_time, unit = unit),
+ adl_ceiling = ceiling_date(adl_time, unit = unit),
+ mar_floored = floor_date(mar_time, unit = unit),
+ mar_rounded = round_date(mar_time, unit = unit),
+ mar_ceiling = ceiling_date(mar_time, unit = unit),
+ kat_floored = floor_date(kat_time, unit = unit),
+ kat_rounded = round_date(kat_time, unit = unit),
+ kat_ceiling = ceiling_date(kat_time, unit = unit)
+ ) %>%
+ collect()
+
+ compare_local_times <- function(time1, time2) {
+ all(year(time1) == year(time1) &
+ month(time1) == month(time2) &
+ day(time1) == day(time2) &
+ hour(time1) == hour(time2) &
+ minute(time1) == minute(time2) &
+ second(time1) == second(time1))
+ }
+
+ base <- shifted_times$utc_rounded
+ expect_true(compare_local_times(shifted_times$syd_rounded, base))
+ expect_true(compare_local_times(shifted_times$adl_rounded, base))
+ expect_true(compare_local_times(shifted_times$mar_rounded, base))
+ expect_true(compare_local_times(shifted_times$kat_rounded, base))
+
+ base <- shifted_times$utc_floored
+ expect_true(compare_local_times(shifted_times$syd_floored, base))
+ expect_true(compare_local_times(shifted_times$adl_floored, base))
+ expect_true(compare_local_times(shifted_times$mar_floored, base))
+ expect_true(compare_local_times(shifted_times$kat_floored, base))
+
+ base <- shifted_times$utc_ceiling
+ expect_true(compare_local_times(shifted_times$syd_ceiling, base))
+ expect_true(compare_local_times(shifted_times$adl_ceiling, base))
+ expect_true(compare_local_times(shifted_times$mar_ceiling, base))
+ expect_true(compare_local_times(shifted_times$kat_ceiling, base))
+}
+
+test_that("timestamp rounding takes place in local time", {
+
+ tz_times %>% check_timezone_rounding_vs_lubridate(".001 second")
+ tz_times %>% check_timezone_rounding_vs_lubridate("second")
+ tz_times %>% check_timezone_rounding_vs_lubridate("minute")
+ tz_times %>% check_timezone_rounding_vs_lubridate("hour")
+ tz_times %>% check_timezone_rounding_vs_lubridate("day")
+ tz_times %>% check_timezone_rounding_vs_lubridate("week")
+ tz_times %>% check_timezone_rounding_vs_lubridate("month")
+ tz_times %>% check_timezone_rounding_vs_lubridate("quarter")
+ tz_times %>% check_timezone_rounding_vs_lubridate("year")
+
+ tz_times %>% check_timezone_rounding_for_consistency("second")
+ tz_times %>% check_timezone_rounding_for_consistency("minute")
+ tz_times %>% check_timezone_rounding_for_consistency("hour")
+ tz_times %>% check_timezone_rounding_for_consistency("day")
+ tz_times %>% check_timezone_rounding_for_consistency("week")
+ tz_times %>% check_timezone_rounding_for_consistency("month")
+ tz_times %>% check_timezone_rounding_for_consistency("quarter")
+ tz_times %>% check_timezone_rounding_for_consistency("year")
+
+ tz_times %>% check_timezone_rounding_for_consistency("7 seconds")
+ tz_times %>% check_timezone_rounding_for_consistency("7 minutes")
+ tz_times %>% check_timezone_rounding_for_consistency("7 hours")
+ tz_times %>% check_timezone_rounding_for_consistency("7 months")
+ tz_times %>% check_timezone_rounding_for_consistency("7 years")
+
+ tz_times %>% check_timezone_rounding_for_consistency("13 seconds")
+ tz_times %>% check_timezone_rounding_for_consistency("13 minutes")
+ tz_times %>% check_timezone_rounding_for_consistency("13 hours")
+ tz_times %>% check_timezone_rounding_for_consistency("13 months")
+ tz_times %>% check_timezone_rounding_for_consistency("13 years")
+
+})