This is an automated email from the ASF dual-hosted git repository. kszucs pushed a commit to branch maint-1.0.x in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 51d0c450ba5a1e88365b1c58aad646ce80ce5db6 Author: Benjamin Kietzman <[email protected]> AuthorDate: Thu Aug 6 12:10:42 2020 -0400 ARROW-9573: [Python][Dataset] Provide read_table(ignore_prefixes=) Closes #7900 from bkietz/9573-expose-ignore_prefixes Authored-by: Benjamin Kietzman <[email protected]> Signed-off-by: Benjamin Kietzman <[email protected]> --- python/pyarrow/dataset.py | 2 +- python/pyarrow/parquet.py | 23 +++++++++++++++++++---- python/pyarrow/tests/test_parquet.py | 24 ++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py index f4620d0..fd03aee 100644 --- a/python/pyarrow/dataset.py +++ b/python/pyarrow/dataset.py @@ -581,7 +581,7 @@ def dataset(source, schema=None, format=None, filesystem=None, files may be present in the Dataset (resulting in an error at scan time). ignore_prefixes : list, optional - Files matching one of those prefixes will be ignored by the + Files matching any of these prefixes will be ignored by the discovery process. This is matched to the basename of a path. By default this is ['.', '_']. Note that discovery happens only if a directory is passed as source. 
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 59c79ac..b5be07f 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -1376,7 +1376,7 @@ class _ParquetDatasetV2: def __init__(self, path_or_paths, filesystem=None, filters=None, partitioning="hive", read_dictionary=None, buffer_size=None, - memory_map=False, **kwargs): + memory_map=False, ignore_prefixes=None, **kwargs): import pyarrow.dataset as ds import pyarrow.fs @@ -1430,7 +1430,8 @@ class _ParquetDatasetV2: self._dataset = ds.dataset(path_or_paths, filesystem=filesystem, format=parquet_format, - partitioning=partitioning) + partitioning=partitioning, + ignore_prefixes=ignore_prefixes) @property def schema(self): @@ -1521,6 +1522,12 @@ use_legacy_dataset : bool, default False for all columns and not only the partition keys, enables different partitioning schemes, etc. Set to False to use the legacy behaviour. +ignore_prefixes : list, optional + Files matching any of these prefixes will be ignored by the + discovery process if use_legacy_dataset=False. + This is matched to the basename of a path. + By default this is ['.', '_']. + Note that discovery happens only if a directory is passed as source. filesystem : FileSystem, default None If nothing passed, paths assumed to be found in the local on-disk filesystem. 
@@ -1544,7 +1551,8 @@ Returns def read_table(source, columns=None, use_threads=True, metadata=None, use_pandas_metadata=False, memory_map=False, read_dictionary=None, filesystem=None, filters=None, - buffer_size=0, partitioning="hive", use_legacy_dataset=False): + buffer_size=0, partitioning="hive", use_legacy_dataset=False, + ignore_prefixes=None): if not use_legacy_dataset: if metadata is not None: raise ValueError( @@ -1562,6 +1570,7 @@ def read_table(source, columns=None, use_threads=True, metadata=None, read_dictionary=read_dictionary, buffer_size=buffer_size, filters=filters, + ignore_prefixes=ignore_prefixes, ) except ImportError: # fall back on ParquetFile for simple cases when pyarrow.dataset @@ -1585,6 +1594,11 @@ def read_table(source, columns=None, use_threads=True, metadata=None, return dataset.read(columns=columns, use_threads=use_threads, use_pandas_metadata=use_pandas_metadata) + if ignore_prefixes is not None: + raise ValueError( + "The 'ignore_prefixes' keyword is only supported when " + "use_legacy_dataset=False") + if _is_path_like(source): pf = ParquetDataset(source, metadata=metadata, memory_map=memory_map, read_dictionary=read_dictionary, @@ -1616,7 +1630,7 @@ switched to False.""", def read_pandas(source, columns=None, use_threads=True, memory_map=False, metadata=None, filters=None, buffer_size=0, - use_legacy_dataset=True): + use_legacy_dataset=True, ignore_prefixes=None): return read_table( source, columns=columns, @@ -1627,6 +1641,7 @@ def read_pandas(source, columns=None, use_threads=True, memory_map=False, buffer_size=buffer_size, use_pandas_metadata=True, use_legacy_dataset=use_legacy_dataset, + ignore_prefixes=ignore_prefixes ) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index a24b1b3..07af08f 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -2678,6 +2678,30 @@ def test_ignore_no_private_directories_path_list( _assert_dataset_paths(dataset, 
paths, use_legacy_dataset) +@pytest.mark.pandas +@parametrize_legacy_dataset_fixed +def test_ignore_custom_prefixes(tempdir, use_legacy_dataset): + # ARROW-9573 - allow override of default ignore_prefixes + part = ["xxx"] * 3 + ["yyy"] * 3 + table = pa.table([ + pa.array(range(len(part))), + pa.array(part).dictionary_encode(), + ], names=['index', '_part']) + + pq.write_to_dataset(table, str(tempdir), partition_cols=['_part']) + + private_duplicate = tempdir / '_private_duplicate' + private_duplicate.mkdir() + pq.write_to_dataset(table, str(private_duplicate), + partition_cols=['_part']) + + read = pq.read_table( + tempdir, use_legacy_dataset=use_legacy_dataset, + ignore_prefixes=['_private']) + + assert read.equals(table) + + @parametrize_legacy_dataset_fixed def test_empty_directory(tempdir, use_legacy_dataset): # ARROW-5310 - reading empty directory
