This is an automated email from the ASF dual-hosted git repository.
thisisnic pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 2a526c1e62 GH-35806: [R] Improve error message for null type inference with sparse CSV data (#49338)
2a526c1e62 is described below
commit 2a526c1e623df0ce8b9b76b03d4d9c617d21fda1
Author: Nic Crane <[email protected]>
AuthorDate: Fri Mar 27 14:59:23 2026 +0000
GH-35806: [R] Improve error message for null type inference with sparse CSV data (#49338)
### Rationale for this change
When reading a CSV with sparse data (many missing values followed by actual
values), Arrow can infer a column type as `null` based on the first block of
data. When non-null values appear later, the error message incorrectly suggests
using `skip = 1` for header rows, which is misleading.
### What changes are included in this PR?
Adds a specific check for "conversion error to null" that provides a
helpful message explaining the cause (type inference from sparse data) and the
solution (change the block size to use for inference).
### Are these changes tested?
Yes, added a test in `test-dataset-csv.R`.
### Are there any user-facing changes?
Yes, improved error message when CSV type inference fails due to sparse
data.
---
This PR was authored by Claude (Opus 4.5) and reviewed by @thisisnic.
🤖 Generated with [Claude Code](https://claude.ai/code)
* GitHub Issue: #35806
Lead-authored-by: Nic Crane <[email protected]>
Co-authored-by: Jonathan Keane <[email protected]>
Signed-off-by: Nic Crane <[email protected]>
---
r/R/util.R | 18 ++++++++++++++++++
r/tests/testthat/test-dataset-csv.R | 18 ++++++++++++++++++
2 files changed, 36 insertions(+)
diff --git a/r/R/util.R b/r/R/util.R
index c63e1ee545..cc16bb7ccf 100644
--- a/r/R/util.R
+++ b/r/R/util.R
@@ -196,6 +196,20 @@ repeat_value_as_array <- function(object, n) {
}
handle_csv_read_error <- function(msg, call, schema) {
+ # Dataset collection passes empty schema() when no explicit
+ # CSV schema from the original call is available in this error path.
+ if (grepl("conversion error to null", msg) && is_empty_schema(schema)) {
+ msg <- c(
+ msg,
+ i = paste(
+        "If you have not specified the schema, this error may be due to the column type being",
+        "inferred as `null` because the first block of data contained only missing values.",
+        "See `?csv_read_options` for how to set a larger value or specify a schema if you know the correct types."
+ )
+ )
+ abort(msg, call = call)
+ }
+
if (grepl("conversion error", msg) && inherits(schema, "Schema")) {
msg <- c(
msg,
@@ -290,3 +304,7 @@ col_type_from_compact <- function(x, y) {
abort(paste0("Unsupported compact specification: '", x, "' for column '", y, "'"))
)
}
+
+is_empty_schema <- function(x) {
+ x == schema()
+}
diff --git a/r/tests/testthat/test-dataset-csv.R b/r/tests/testthat/test-dataset-csv.R
index 749d1672ac..ce3f94e7f5 100644
--- a/r/tests/testthat/test-dataset-csv.R
+++ b/r/tests/testthat/test-dataset-csv.R
@@ -711,3 +711,21 @@ test_that("open_dataset() with `decimal_point` argument", {
tibble(x = 1.2, y = "c")
)
})
+
+test_that("more informative error when column inferred as null due to sparse data (GH-35806)", {
+ tf <- tempfile()
+ on.exit(unlink(tf))
+
+ writeLines(c("x,y", paste0(1:100, ",")), tf)
+ write("101,foo", tf, append = TRUE)
+
+ expect_error(
+ open_dataset(
+ tf,
+ format = "csv",
+ read_options = csv_read_options(block_size = 100L)
+ ) |>
+ collect(),
+ "column type being inferred as"
+ )
+})