Zhuo Jia Dai created ARROW-7809: ----------------------------------- Summary: R vignette does not run on Win 10 nor ubuntu Key: ARROW-7809 URL: https://issues.apache.org/jira/browse/ARROW-7809 Project: Apache Arrow Issue Type: Bug Reporter: Zhuo Jia Dai
On Win10 {code:java} bucket <- "https://ursa-labs-taxi-data.s3.us-east-2.amazonaws.com" dir.create("nyc-taxi") for (year in 2018:2018) { if(!dir.exists(glue::glue("nyc-taxi/ {year}/"))) { dir.create(glue::glue("nyc-taxi/{year} /")) } for (month in 1:12) { if (month < 10) { month <- paste0("0", month) } if(!dir.exists(glue::glue("nyc-taxi/ {year}/{month}"))) { dir.create(glue::glue("nyc-taxi/{year} / {month} ")) } try(download.file( paste(bucket, year, month, "data.parquet", sep = "/"), file.path("nyc-taxi", year, month, "data.parquet") )) } } aa = arrow::open_dataset("nyc-taxi", partitioning = c("year", "month")) {code} gives the error {code:java} Error in dataset___FSSFactory__Make3(filesystem, selector, format, partitioning) : IOError: Could not open parquet input source 'nyc-taxi/2018/01/data.parquet': Couldn't deserialize thrift: TProtocolException: Invalid data In addition: Warning message: {code} On Ubuntu, running {code:java} library(dplyr) ds = arrow::open_dataset("nyc-taxi", partitioning = c("year", "month")) system.time(ds %>% filter(total_amount > 100, year == 2015) %>% select(tip_amount, total_amount, passenger_count) %>% group_by(passenger_count) %>% collect() %>% summarize( tip_pct = median(100 * tip_amount / total_amount), n = n() ) %>% print()) {code} gives the following segfault {code:java} *** caught segfault *** address (nil), cause 'memory not mapped' Traceback: 1: Table__to_dataframe(x, use_threads = option_use_threads()) 2: as.data.frame.Table(scanner_builder$Finish()$ToTable()) 3: as.data.frame(scanner_builder$Finish()$ToTable()) 4: collect.arrow_dplyr_query(.) 5: collect(.) 
6: function_list[[i]](value) 7: freduce(value, `_function_list`) 8: `_fseq`(`_lhs`) 9: eval(quote(`_fseq`(`_lhs`)), env, env) 10: eval(quote(`_fseq`(`_lhs`)), env, env) 11: withVisible(eval(quote(`_fseq`(`_lhs`)), env, env)) 12: ds %>% filter(total_amount > 100, year == 2015) %>% select(tip_amount, total_amount, passenger_count) %>% group_by(passenger_count) %>% collect() %>% summarize(tip_pct = median(100 * tip_amount/total_amount), n = n()) %>% print() 13: system.time(ds %>% filter(total_amount > 100, year == 2015) %>% select(tip_amount, total_amount, passenger_count) %>% group_by(passenger_count) %>% collect() %>% summarize(tip_pct = median(100 * tip_amount/total_amount), n = n()) %>% print()) {code} -- This message was sent by Atlassian Jira (v8.3.4#803005)