Zhuo Jia Dai created ARROW-7809:
-----------------------------------
Summary: R vignette does not run on Win 10 nor ubuntu
Key: ARROW-7809
URL: https://issues.apache.org/jira/browse/ARROW-7809
Project: Apache Arrow
Issue Type: Bug
Reporter: Zhuo Jia Dai
On Win10
{code:java}
bucket <- "https://ursa-labs-taxi-data.s3.us-east-2.amazonaws.com"
dir.create("nyc-taxi")
for (year in 2018:2018) {
if(!dir.exists(glue::glue("nyc-taxi/{year}/"))) {
dir.create(glue::glue("nyc-taxi/{year}/"))
}
for (month in 1:12) {
if (month < 10)
{ month <- paste0("0", month) }
if(!dir.exists(glue::glue("nyc-taxi/{year}/{month}"))) {
dir.create(glue::glue("nyc-taxi/{year}/{month}"))
}
try(download.file(
paste(bucket, year, month, "data.parquet", sep = "/"),
file.path("nyc-taxi", year, month, "data.parquet")
))
}
}
aa = arrow::open_dataset("nyc-taxi", partitioning = c("year", "month"))
{code}
gives error
{code:java}
Error in dataset___FSSFactory__Make3(filesystem, selector, format,
partitioning) :
IOError: Could not open parquet input source 'nyc-taxi/2018/01/data.parquet':
Couldn't deserialize thrift: TProtocolException: Invalid data
In addition: Warning message:
{code}
On Ubuntu, running
{code:java}
library(dplyr)
ds = arrow::open_dataset("nyc-taxi", partitioning = c("year", "month"))
system.time(ds %>%
filter(total_amount > 100, year == 2015) %>%
select(tip_amount, total_amount, passenger_count) %>%
group_by(passenger_count) %>%
collect() %>%
summarize(
tip_pct = median(100 * tip_amount / total_amount),
n = n()
) %>%
print())
{code}
gives the following segfault
{code:java}
*** caught segfault ***
address (nil), cause 'memory not mapped'
Traceback:
1: Table__to_dataframe(x, use_threads = option_use_threads())
2: as.data.frame.Table(scanner_builder$Finish()$ToTable())
3: as.data.frame(scanner_builder$Finish()$ToTable())
4: collect.arrow_dplyr_query(.)
5: collect(.)
6: function_list[[i]](value)
7: freduce(value, `_function_list`)
8: `_fseq`(`_lhs`)
9: eval(quote(`_fseq`(`_lhs`)), env, env)
10: eval(quote(`_fseq`(`_lhs`)), env, env)
11: withVisible(eval(quote(`_fseq`(`_lhs`)), env, env))
12: ds %>% filter(total_amount > 100, year == 2015) %>% select(tip_amount,
total_amount, passenger_count) %>% group_by(passenger_count) %>% collect()
%>% summarize(tip_pct = median(100 * tip_amount/total_amount), n = n()) %>%
print()
13: system.time(ds %>% filter(total_amount > 100, year == 2015) %>%
select(tip_amount, total_amount, passenger_count) %>% group_by(passenger_count)
%>% collect() %>% summarize(tip_pct = median(100 *
tip_amount/total_amount), n = n()) %>% print())
{code}
--
This message was sent by Atlassian Jira
(v8.3.4#803005)