Zhuo Jia Dai created ARROW-7809:
-----------------------------------

             Summary: R vignette does not run on Win 10 nor ubuntu
                 Key: ARROW-7809
                 URL: https://issues.apache.org/jira/browse/ARROW-7809
             Project: Apache Arrow
          Issue Type: Bug
            Reporter: Zhuo Jia Dai


On Win10
{code:java}
# Reproduction script from the report: copy the 2018 NYC taxi parquet files
# into nyc-taxi/<year>/<month>/data.parquet, then open the directory tree as
# a dataset partitioned by year and month.
bucket <- "https://ursa-labs-taxi-data.s3.us-east-2.amazonaws.com"
dir.create("nyc-taxi")
for (year in 2018:2018) {
  if (!dir.exists(glue::glue("nyc-taxi/{year}/"))) {
    dir.create(glue::glue("nyc-taxi/{year}/"))
  }
  for (month in 1:12) {
    # Zero-pad single-digit months ("01".."09"); the padded value is used in
    # both the remote URL and the local path below.
    if (month < 10) {
      month <- paste0("0", month)
    }
    if (!dir.exists(glue::glue("nyc-taxi/{year}/{month}"))) {
      dir.create(glue::glue("nyc-taxi/{year}/{month}"))
    }
    # try() keeps the loop going when a single month fails to download.
    # NOTE(review): no `mode` argument is passed, so download.file() uses its
    # default text mode ("w"). On Windows that can corrupt binary files such
    # as parquet; `mode = "wb"` is the likely remedy for the "Invalid data"
    # error reported on Win10 -- confirm.
    try(download.file(
      paste(bucket, year, month, "data.parquet", sep = "/"),
      file.path("nyc-taxi", year, month, "data.parquet")
    ))
  }
}
aa <- arrow::open_dataset("nyc-taxi", partitioning = c("year", "month"))
{code}
gives error

 
{code:java}
Error in dataset___FSSFactory__Make3(filesystem, selector, format, 
partitioning) : 
  IOError: Could not open parquet input source 'nyc-taxi/2018/01/data.parquet': 
Couldn't deserialize thrift: TProtocolException: Invalid data
In addition: Warning message:
{code}
On Ubuntu, running
{code:java}
library(dplyr)

# Open the local nyc-taxi directory tree as a dataset whose two directory
# levels are exposed as the `year` and `month` columns.
ds <- arrow::open_dataset("nyc-taxi", partitioning = c("year", "month"))

# Time the query. filter/select/group_by are applied to the dataset query;
# collect() then pulls the result into R, and summarize() runs on the
# collected data.
system.time(
  ds %>%
    filter(total_amount > 100, year == 2015) %>%
    select(tip_amount, total_amount, passenger_count) %>%
    group_by(passenger_count) %>%
    collect() %>%
    summarize(
      tip_pct = median(100 * tip_amount / total_amount),
      n = n()
    ) %>%
    print()
)

{code}
gives the following segfault
{code:java}
*** caught segfault ***
address (nil), cause 'memory not mapped'

Traceback:
 1: Table__to_dataframe(x, use_threads = option_use_threads())
 2: as.data.frame.Table(scanner_builder$Finish()$ToTable())
 3: as.data.frame(scanner_builder$Finish()$ToTable())
 4: collect.arrow_dplyr_query(.)
 5: collect(.)
 6: function_list[[i]](value)
 7: freduce(value, `_function_list`)
 8: `_fseq`(`_lhs`)
 9: eval(quote(`_fseq`(`_lhs`)), env, env)
10: eval(quote(`_fseq`(`_lhs`)), env, env)
11: withVisible(eval(quote(`_fseq`(`_lhs`)), env, env))
12: ds %>% filter(total_amount > 100, year == 2015) %>% select(tip_amount,     
total_amount, passenger_count) %>% group_by(passenger_count) %>%     collect() 
%>% summarize(tip_pct = median(100 * tip_amount/total_amount),     n = n()) %>% 
print()
13: system.time(ds %>% filter(total_amount > 100, year == 2015) %>%     
select(tip_amount, total_amount, passenger_count) %>% group_by(passenger_count) 
%>%     collect() %>% summarize(tip_pct = median(100 * 
tip_amount/total_amount),     n = n()) %>% print())
{code}
 



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to