paleolimbot commented on code in PR #26:
URL:
https://github.com/apache/datafusion-benchmarks/pull/26#discussion_r2654734090
##########
microbenchmarks/microbenchmarks.py:
##########
@@ -0,0 +1,340 @@
+#!/usr/bin/env python3
+"""
+Microbenchmark comparing DataFusion and DuckDB performance
+for SQL string functions on Parquet files.
+"""
+
+import tempfile
+import time
+import os
+from dataclasses import dataclass
+from pathlib import Path
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+import datafusion
+import duckdb
+
+
+@dataclass
+class BenchmarkResult:
+ """Stores benchmark results for a single function."""
+ function_name: str
+ datafusion_time_ms: float
+ duckdb_time_ms: float
+ rows: int
+
+ @property
+ def speedup(self) -> float:
+ """DuckDB time / DataFusion time (>1 means DataFusion is faster)."""
+ if self.datafusion_time_ms == 0:
+ return float('inf')
+ return self.duckdb_time_ms / self.datafusion_time_ms
+
+
+@dataclass
+class StringFunction:
+ """Defines a string function with syntax for both engines."""
+ name: str
+ datafusion_expr: str # Expression using {col} as placeholder for column
name
+ duckdb_expr: str # Expression using {col} as placeholder for column
name
+
+
+# String functions to benchmark
+# {col} will be replaced with the actual column name
+STRING_FUNCTIONS = [
+ StringFunction("trim", "trim({col})", "trim({col})"),
+ StringFunction("ltrim", "ltrim({col})", "ltrim({col})"),
+ StringFunction("rtrim", "rtrim({col})", "rtrim({col})"),
+ StringFunction("lower", "lower({col})", "lower({col})"),
+ StringFunction("upper", "upper({col})", "upper({col})"),
+ StringFunction("length", "length({col})", "length({col})"),
+ StringFunction("char_length", "char_length({col})", "length({col})"),
+ StringFunction("reverse", "reverse({col})", "reverse({col})"),
+ StringFunction("repeat_3", "repeat({col}, 3)", "repeat({col}, 3)"),
+ StringFunction("concat", "concat({col}, {col})", "concat({col}, {col})"),
+ StringFunction("concat_ws", "concat_ws('-', {col}, {col})",
"concat_ws('-', {col}, {col})"),
+ StringFunction("substring_1_5", "substring({col}, 1, 5)",
"substring({col}, 1, 5)"),
+ StringFunction("left_5", "left({col}, 5)", "left({col}, 5)"),
+ StringFunction("right_5", "right({col}, 5)", "right({col}, 5)"),
+ StringFunction("lpad_20", "lpad({col}, 20, '*')", "lpad({col}, 20, '*')"),
+ StringFunction("rpad_20", "rpad({col}, 20, '*')", "rpad({col}, 20, '*')"),
+ StringFunction("replace", "replace({col}, 'a', 'X')", "replace({col}, 'a',
'X')"),
+ StringFunction("translate", "translate({col}, 'aeiou', '12345')",
"translate({col}, 'aeiou', '12345')"),
+ StringFunction("ascii", "ascii({col})", "ascii({col})"),
+ StringFunction("md5", "md5({col})", "md5({col})"),
+ StringFunction("sha256", "sha256({col})", "sha256({col})"),
+ StringFunction("btrim", "btrim({col}, ' ')", "trim({col}, ' ')"),
+ StringFunction("split_part", "split_part({col}, ' ', 1)",
"split_part({col}, ' ', 1)"),
+ StringFunction("starts_with", "starts_with({col}, 'test')",
"starts_with({col}, 'test')"),
+ StringFunction("ends_with", "ends_with({col}, 'data')", "ends_with({col},
'data')"),
+ StringFunction("strpos", "strpos({col}, 'e')", "strpos({col}, 'e')"),
+ StringFunction("regexp_replace", "regexp_replace({col}, '[aeiou]', '*')",
"regexp_replace({col}, '[aeiou]', '*', 'g')"),
+]
+
+
+def generate_test_data(num_rows: int = 1_000_000) -> pa.Table:
+ """Generate test data with various string patterns."""
+ import random
+ import string
+
+ random.seed(42) # For reproducibility
+
+ # Generate diverse string data
+ strings = []
+ for i in range(num_rows):
+ # Mix of different string patterns
+ pattern_type = i % 5
+ if pattern_type == 0:
+ # Short strings with spaces
+ s = f" test_{i % 1000} "
+ elif pattern_type == 1:
+ # Longer strings
+ s = ''.join(random.choices(string.ascii_lowercase, k=20))
+ elif pattern_type == 2:
+ # Mixed case with numbers
+ s = f"TestData_{i}_Value"
+ elif pattern_type == 3:
+ # Strings with special patterns
+ s = f"hello world {i % 100} data"
+ else:
+ # Random length strings
+ length = random.randint(5, 50)
+ s = ''.join(random.choices(string.ascii_letters + string.digits +
' ', k=length))
+ strings.append(s)
+
+ table = pa.table({
+ 'str_col': pa.array(strings, type=pa.string())
Review Comment:
It may be worth trying both `string` and `string_view`. In theory
passing `string_view` to DuckDB has less overhead because the string view is closer to
its internal representation. It also might be that DataFusion performs
differently when one or the other is used as input.
##########
microbenchmarks/microbenchmarks.py:
##########
@@ -0,0 +1,340 @@
+#!/usr/bin/env python3
+"""
+Microbenchmark comparing DataFusion and DuckDB performance
+for SQL string functions on Parquet files.
+"""
+
+import tempfile
+import time
+import os
+from dataclasses import dataclass
+from pathlib import Path
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+import datafusion
+import duckdb
+
+
+@dataclass
+class BenchmarkResult:
+ """Stores benchmark results for a single function."""
+ function_name: str
+ datafusion_time_ms: float
+ duckdb_time_ms: float
+ rows: int
+
+ @property
+ def speedup(self) -> float:
+ """DuckDB time / DataFusion time (>1 means DataFusion is faster)."""
+ if self.datafusion_time_ms == 0:
+ return float('inf')
+ return self.duckdb_time_ms / self.datafusion_time_ms
+
+
+@dataclass
+class StringFunction:
+ """Defines a string function with syntax for both engines."""
+ name: str
+ datafusion_expr: str # Expression using {col} as placeholder for column
name
+ duckdb_expr: str # Expression using {col} as placeholder for column
name
+
+
+# String functions to benchmark
+# {col} will be replaced with the actual column name
+STRING_FUNCTIONS = [
+ StringFunction("trim", "trim({col})", "trim({col})"),
+ StringFunction("ltrim", "ltrim({col})", "ltrim({col})"),
+ StringFunction("rtrim", "rtrim({col})", "rtrim({col})"),
+ StringFunction("lower", "lower({col})", "lower({col})"),
+ StringFunction("upper", "upper({col})", "upper({col})"),
+ StringFunction("length", "length({col})", "length({col})"),
+ StringFunction("char_length", "char_length({col})", "length({col})"),
+ StringFunction("reverse", "reverse({col})", "reverse({col})"),
+ StringFunction("repeat_3", "repeat({col}, 3)", "repeat({col}, 3)"),
+ StringFunction("concat", "concat({col}, {col})", "concat({col}, {col})"),
+ StringFunction("concat_ws", "concat_ws('-', {col}, {col})",
"concat_ws('-', {col}, {col})"),
+ StringFunction("substring_1_5", "substring({col}, 1, 5)",
"substring({col}, 1, 5)"),
+ StringFunction("left_5", "left({col}, 5)", "left({col}, 5)"),
+ StringFunction("right_5", "right({col}, 5)", "right({col}, 5)"),
+ StringFunction("lpad_20", "lpad({col}, 20, '*')", "lpad({col}, 20, '*')"),
+ StringFunction("rpad_20", "rpad({col}, 20, '*')", "rpad({col}, 20, '*')"),
+ StringFunction("replace", "replace({col}, 'a', 'X')", "replace({col}, 'a',
'X')"),
+ StringFunction("translate", "translate({col}, 'aeiou', '12345')",
"translate({col}, 'aeiou', '12345')"),
+ StringFunction("ascii", "ascii({col})", "ascii({col})"),
+ StringFunction("md5", "md5({col})", "md5({col})"),
+ StringFunction("sha256", "sha256({col})", "sha256({col})"),
+ StringFunction("btrim", "btrim({col}, ' ')", "trim({col}, ' ')"),
+ StringFunction("split_part", "split_part({col}, ' ', 1)",
"split_part({col}, ' ', 1)"),
+ StringFunction("starts_with", "starts_with({col}, 'test')",
"starts_with({col}, 'test')"),
+ StringFunction("ends_with", "ends_with({col}, 'data')", "ends_with({col},
'data')"),
+ StringFunction("strpos", "strpos({col}, 'e')", "strpos({col}, 'e')"),
+ StringFunction("regexp_replace", "regexp_replace({col}, '[aeiou]', '*')",
"regexp_replace({col}, '[aeiou]', '*', 'g')"),
+]
+
+
+def generate_test_data(num_rows: int = 1_000_000) -> pa.Table:
+ """Generate test data with various string patterns."""
+ import random
+ import string
+
+ random.seed(42) # For reproducibility
+
+ # Generate diverse string data
+ strings = []
+ for i in range(num_rows):
+ # Mix of different string patterns
+ pattern_type = i % 5
+ if pattern_type == 0:
+ # Short strings with spaces
+ s = f" test_{i % 1000} "
+ elif pattern_type == 1:
+ # Longer strings
+ s = ''.join(random.choices(string.ascii_lowercase, k=20))
+ elif pattern_type == 2:
+ # Mixed case with numbers
+ s = f"TestData_{i}_Value"
+ elif pattern_type == 3:
+ # Strings with special patterns
+ s = f"hello world {i % 100} data"
+ else:
+ # Random length strings
+ length = random.randint(5, 50)
+ s = ''.join(random.choices(string.ascii_letters + string.digits +
' ', k=length))
+ strings.append(s)
+
+ table = pa.table({
+ 'str_col': pa.array(strings, type=pa.string())
+ })
+
+ return table
+
+
+def setup_datafusion(parquet_path: str) -> datafusion.SessionContext:
+ """Create and configure DataFusion context."""
+ ctx = datafusion.SessionContext()
+ ctx.register_parquet('test_data', parquet_path)
+ return ctx
+
+
+def setup_duckdb(parquet_path: str) -> duckdb.DuckDBPyConnection:
+ """Create and configure DuckDB connection."""
+ conn = duckdb.connect(':memory:')
+ conn.execute(f"CREATE VIEW test_data AS SELECT * FROM
read_parquet('{parquet_path}')")
+ return conn
Review Comment:
In SedonaDB we found that there was wildly differing concurrency that
resulted from the default settings for DataFusion and DuckDB for our micro-ish
benchmarks. For these types of benchmarks we set DataFusion to use one
partition and DuckDB to specifically use a single thread (we don't do this for
more macro-scale benchmarks where we really do want to know what happens when a
user sits down and types something against all the defaults):
https://github.com/apache/sedona-db/blob/e0e1d109480727faaf7be25923b57b4686144438/python/sedonadb/python/sedonadb/testing.py#L407-L412
https://github.com/apache/sedona-db/blob/e0e1d109480727faaf7be25923b57b4686144438/python/sedonadb/python/sedonadb/testing.py#L347-L353
I might also suggest trying the config for DuckDB to return StringViews to
see if there's any Arrow conversion overhead getting in the way (I think the
config is `SET produce_arrow_string_view = true;`).
Another thing to try is having both DuckDB and DataFusion operate on Arrow
data from memory instead of Parquet (to make sure we're not just measuring the
speed of the Parquet read). For DuckDB that's `SELECT ... FROM
the_name_of_a_python_variable_that_is_a_pyarrow_table`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]