timsaucer commented on code in PR #981: URL: https://github.com/apache/datafusion-python/pull/981#discussion_r1907925883
########## python/datafusion/dataframe.py: ########## @@ -620,17 +679,34 @@ def write_csv(self, path: str | pathlib.Path, with_header: bool = False) -> None def write_parquet( self, path: str | pathlib.Path, - compression: str = "uncompressed", + compression: str = Compression.ZSTD.value, Review Comment: It would be nice for `compression` to accept the type `str | Compression`, with a quick check that extracts the value when a `Compression` is passed. ########## python/datafusion/dataframe.py: ########## @@ -35,6 +35,65 @@ from datafusion._internal import DataFrame as DataFrameInternal from datafusion.expr import Expr, SortExpr, sort_or_default +from enum import Enum + + +# excerpt from deltalake +# https://github.com/apache/datafusion-python/pull/981#discussion_r1905619163 +class Compression(Enum): + """Enum representing the available compression types for Parquet files.""" + + UNCOMPRESSED = "uncompressed" + SNAPPY = "snappy" + GZIP = "gzip" + BROTLI = "brotli" + LZ4 = "lz4" + LZ0 = "lz0" + ZSTD = "zstd" + LZ4_RAW = "lz4_raw" + + @classmethod + def from_str(cls, value: str) -> "Compression": + """Convert a string to a Compression enum value. + + Args: + value (str): The string representation of the compression type. + + Returns: + Compression: The corresponding Compression enum value. + + Raises: + ValueError: If the string does not match any Compression enum value. + """ + try: + return cls(value.lower()) + except ValueError: + raise ValueError( + f"{value} is not a valid Compression. Valid values are: {[item.value for item in Compression]}" + ) + + def get_default_level(self) -> int: + """Get the default compression level for the compression type. + + Returns: + int: The default compression level. 
Review Comment: nit: `int` not required since it's in the hint ########## python/datafusion/dataframe.py: ########## @@ -620,17 +679,34 @@ def write_csv(self, path: str | pathlib.Path, with_header: bool = False) -> None def write_parquet( self, path: str | pathlib.Path, - compression: str = "uncompressed", + compression: str = Compression.ZSTD.value, compression_level: int | None = None, ) -> None: """Execute the :py:class:`DataFrame` and write the results to a Parquet file. Args: path: Path of the Parquet file to write. - compression: Compression type to use. - compression_level: Compression level to use. - """ - self.df.write_parquet(str(path), compression, compression_level) + compression: Compression type to use. Default is "ZSTD". + Available compression types are: + - "uncompressed": No compression. + - "snappy": Snappy compression. + - "gzip": Gzip compression. + - "brotli": Brotli compression. + - "lz0": LZ0 compression. + - "lz4": LZ4 compression. + - "lz4_raw": LZ4_RAW compression. + - "zstd": Zstandard compression. + compression_level: Compression level to use. For ZSTD, the + recommended range is 1 to 22, with the default being 4. Higher levels + provide better compression but slower speed. + """ + compression_enum = Compression.from_str(compression) + + if compression_enum in {Compression.GZIP, Compression.BROTLI, Compression.ZSTD}: + if compression_level is None: + compression_level = compression_enum.get_default_level() Review Comment: Rather than doing the checking here it would be slightly more ergonomic to just call `compression_enum.get_default_level()` and have it return None rather than raise an error. But I could also see how some would see calling `get_default_level` on the others as invalid. I'm not married to this idea. 
########## python/datafusion/dataframe.py: ########## @@ -35,6 +35,65 @@ from datafusion._internal import DataFrame as DataFrameInternal from datafusion.expr import Expr, SortExpr, sort_or_default +from enum import Enum + + +# excerpt from deltalake +# https://github.com/apache/datafusion-python/pull/981#discussion_r1905619163 +class Compression(Enum): + """Enum representing the available compression types for Parquet files.""" + + UNCOMPRESSED = "uncompressed" + SNAPPY = "snappy" + GZIP = "gzip" + BROTLI = "brotli" + LZ4 = "lz4" + LZ0 = "lz0" + ZSTD = "zstd" + LZ4_RAW = "lz4_raw" + + @classmethod + def from_str(cls, value: str) -> "Compression": + """Convert a string to a Compression enum value. + + Args: + value (str): The string representation of the compression type. Review Comment: nit: since the type hint already indicates a `str`, you shouldn't have to repeat it here, per the Google Python Style Guide. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org