paleolimbot commented on code in PR #26:
URL:
https://github.com/apache/datafusion-benchmarks/pull/26#discussion_r2654734090
##########
microbenchmarks/microbenchmarks.py:
##########
@@ -0,0 +1,340 @@
+#!/usr/bin/env python3
+"""
+Microbenchmark comparing DataFusion and DuckDB performance
+for SQL string functions on Parquet files.
+"""
+
+import tempfile
+import time
+import os
+from dataclasses import dataclass
+from pathlib import Path
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+import datafusion
+import duckdb
+
+
+@dataclass
+class BenchmarkResult:
+ """Stores benchmark results for a single function."""
+ function_name: str
+ datafusion_time_ms: float
+ duckdb_time_ms: float
+ rows: int
+
+ @property
+ def speedup(self) -> float:
+ """DuckDB time / DataFusion time (>1 means DataFusion is faster)."""
+ if self.datafusion_time_ms == 0:
+ return float('inf')
+ return self.duckdb_time_ms / self.datafusion_time_ms
+
+
+@dataclass
+class StringFunction:
+ """Defines a string function with syntax for both engines."""
+ name: str
+ datafusion_expr: str # Expression using {col} as placeholder for column
name
+ duckdb_expr: str # Expression using {col} as placeholder for column
name
+
+
+# String functions to benchmark
+# {col} will be replaced with the actual column name
+STRING_FUNCTIONS = [
+ StringFunction("trim", "trim({col})", "trim({col})"),
+ StringFunction("ltrim", "ltrim({col})", "ltrim({col})"),
+ StringFunction("rtrim", "rtrim({col})", "rtrim({col})"),
+ StringFunction("lower", "lower({col})", "lower({col})"),
+ StringFunction("upper", "upper({col})", "upper({col})"),
+ StringFunction("length", "length({col})", "length({col})"),
+ StringFunction("char_length", "char_length({col})", "length({col})"),
+ StringFunction("reverse", "reverse({col})", "reverse({col})"),
+ StringFunction("repeat_3", "repeat({col}, 3)", "repeat({col}, 3)"),
+ StringFunction("concat", "concat({col}, {col})", "concat({col}, {col})"),
+ StringFunction("concat_ws", "concat_ws('-', {col}, {col})",
"concat_ws('-', {col}, {col})"),
+ StringFunction("substring_1_5", "substring({col}, 1, 5)",
"substring({col}, 1, 5)"),
+ StringFunction("left_5", "left({col}, 5)", "left({col}, 5)"),
+ StringFunction("right_5", "right({col}, 5)", "right({col}, 5)"),
+ StringFunction("lpad_20", "lpad({col}, 20, '*')", "lpad({col}, 20, '*')"),
+ StringFunction("rpad_20", "rpad({col}, 20, '*')", "rpad({col}, 20, '*')"),
+ StringFunction("replace", "replace({col}, 'a', 'X')", "replace({col}, 'a',
'X')"),
+ StringFunction("translate", "translate({col}, 'aeiou', '12345')",
"translate({col}, 'aeiou', '12345')"),
+ StringFunction("ascii", "ascii({col})", "ascii({col})"),
+ StringFunction("md5", "md5({col})", "md5({col})"),
+ StringFunction("sha256", "sha256({col})", "sha256({col})"),
+ StringFunction("btrim", "btrim({col}, ' ')", "trim({col}, ' ')"),
+ StringFunction("split_part", "split_part({col}, ' ', 1)",
"split_part({col}, ' ', 1)"),
+ StringFunction("starts_with", "starts_with({col}, 'test')",
"starts_with({col}, 'test')"),
+ StringFunction("ends_with", "ends_with({col}, 'data')", "ends_with({col},
'data')"),
+ StringFunction("strpos", "strpos({col}, 'e')", "strpos({col}, 'e')"),
+ StringFunction("regexp_replace", "regexp_replace({col}, '[aeiou]', '*')",
"regexp_replace({col}, '[aeiou]', '*', 'g')"),
+]
+
+
+def generate_test_data(num_rows: int = 1_000_000) -> pa.Table:
+ """Generate test data with various string patterns."""
+ import random
+ import string
+
+ random.seed(42) # For reproducibility
+
+ # Generate diverse string data
+ strings = []
+ for i in range(num_rows):
+ # Mix of different string patterns
+ pattern_type = i % 5
+ if pattern_type == 0:
+ # Short strings with spaces
+ s = f" test_{i % 1000} "
+ elif pattern_type == 1:
+ # Longer strings
+ s = ''.join(random.choices(string.ascii_lowercase, k=20))
+ elif pattern_type == 2:
+ # Mixed case with numbers
+ s = f"TestData_{i}_Value"
+ elif pattern_type == 3:
+ # Strings with special patterns
+ s = f"hello world {i % 100} data"
+ else:
+ # Random length strings
+ length = random.randint(5, 50)
+ s = ''.join(random.choices(string.ascii_letters + string.digits +
' ', k=length))
+ strings.append(s)
+
+ table = pa.table({
+ 'str_col': pa.array(strings, type=pa.string())
Review Comment:
It may be worth trying both `string` and `string_view`. In theory
passing `string_view` to DuckDB has less overhead because the string view is closer to
its internal representation. It also might be that DataFusion performs
differently when one or the other is used as input.
##########
microbenchmarks/microbenchmarks.py:
##########
@@ -0,0 +1,340 @@
+#!/usr/bin/env python3
+"""
+Microbenchmark comparing DataFusion and DuckDB performance
+for SQL string functions on Parquet files.
+"""
+
+import tempfile
+import time
+import os
+from dataclasses import dataclass
+from pathlib import Path
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+import datafusion
+import duckdb
+
+
+@dataclass
+class BenchmarkResult:
+ """Stores benchmark results for a single function."""
+ function_name: str
+ datafusion_time_ms: float
+ duckdb_time_ms: float
+ rows: int
+
+ @property
+ def speedup(self) -> float:
+ """DuckDB time / DataFusion time (>1 means DataFusion is faster)."""
+ if self.datafusion_time_ms == 0:
+ return float('inf')
+ return self.duckdb_time_ms / self.datafusion_time_ms
+
+
+@dataclass
+class StringFunction:
+ """Defines a string function with syntax for both engines."""
+ name: str
+ datafusion_expr: str # Expression using {col} as placeholder for column
name
+ duckdb_expr: str # Expression using {col} as placeholder for column
name
+
+
+# String functions to benchmark
+# {col} will be replaced with the actual column name
+STRING_FUNCTIONS = [
+ StringFunction("trim", "trim({col})", "trim({col})"),
+ StringFunction("ltrim", "ltrim({col})", "ltrim({col})"),
+ StringFunction("rtrim", "rtrim({col})", "rtrim({col})"),
+ StringFunction("lower", "lower({col})", "lower({col})"),
+ StringFunction("upper", "upper({col})", "upper({col})"),
+ StringFunction("length", "length({col})", "length({col})"),
+ StringFunction("char_length", "char_length({col})", "length({col})"),
+ StringFunction("reverse", "reverse({col})", "reverse({col})"),
+ StringFunction("repeat_3", "repeat({col}, 3)", "repeat({col}, 3)"),
+ StringFunction("concat", "concat({col}, {col})", "concat({col}, {col})"),
+ StringFunction("concat_ws", "concat_ws('-', {col}, {col})",
"concat_ws('-', {col}, {col})"),
+ StringFunction("substring_1_5", "substring({col}, 1, 5)",
"substring({col}, 1, 5)"),
+ StringFunction("left_5", "left({col}, 5)", "left({col}, 5)"),
+ StringFunction("right_5", "right({col}, 5)", "right({col}, 5)"),
+ StringFunction("lpad_20", "lpad({col}, 20, '*')", "lpad({col}, 20, '*')"),
+ StringFunction("rpad_20", "rpad({col}, 20, '*')", "rpad({col}, 20, '*')"),
+ StringFunction("replace", "replace({col}, 'a', 'X')", "replace({col}, 'a',
'X')"),
+ StringFunction("translate", "translate({col}, 'aeiou', '12345')",
"translate({col}, 'aeiou', '12345')"),
+ StringFunction("ascii", "ascii({col})", "ascii({col})"),
+ StringFunction("md5", "md5({col})", "md5({col})"),
+ StringFunction("sha256", "sha256({col})", "sha256({col})"),
+ StringFunction("btrim", "btrim({col}, ' ')", "trim({col}, ' ')"),
+ StringFunction("split_part", "split_part({col}, ' ', 1)",
"split_part({col}, ' ', 1)"),
+ StringFunction("starts_with", "starts_with({col}, 'test')",
"starts_with({col}, 'test')"),
+ StringFunction("ends_with", "ends_with({col}, 'data')", "ends_with({col},
'data')"),
+ StringFunction("strpos", "strpos({col}, 'e')", "strpos({col}, 'e')"),
+ StringFunction("regexp_replace", "regexp_replace({col}, '[aeiou]', '*')",
"regexp_replace({col}, '[aeiou]', '*', 'g')"),
+]
+
+
+def generate_test_data(num_rows: int = 1_000_000) -> pa.Table:
+ """Generate test data with various string patterns."""
+ import random
+ import string
+
+ random.seed(42) # For reproducibility
+
+ # Generate diverse string data
+ strings = []
+ for i in range(num_rows):
+ # Mix of different string patterns
+ pattern_type = i % 5
+ if pattern_type == 0:
+ # Short strings with spaces
+ s = f" test_{i % 1000} "
+ elif pattern_type == 1:
+ # Longer strings
+ s = ''.join(random.choices(string.ascii_lowercase, k=20))
+ elif pattern_type == 2:
+ # Mixed case with numbers
+ s = f"TestData_{i}_Value"
+ elif pattern_type == 3:
+ # Strings with special patterns
+ s = f"hello world {i % 100} data"
+ else:
+ # Random length strings
+ length = random.randint(5, 50)
+ s = ''.join(random.choices(string.ascii_letters + string.digits +
' ', k=length))
+ strings.append(s)
+
+ table = pa.table({
+ 'str_col': pa.array(strings, type=pa.string())
+ })
+
+ return table
+
+
+def setup_datafusion(parquet_path: str) -> datafusion.SessionContext:
+ """Create and configure DataFusion context."""
+ ctx = datafusion.SessionContext()
+ ctx.register_parquet('test_data', parquet_path)
+ return ctx
+
+
+def setup_duckdb(parquet_path: str) -> duckdb.DuckDBPyConnection:
+ """Create and configure DuckDB connection."""
+ conn = duckdb.connect(':memory:')
+ conn.execute(f"CREATE VIEW test_data AS SELECT * FROM
read_parquet('{parquet_path}')")
+ return conn
Review Comment:
In SedonaDB we found that there was wildly differing concurrency that
resulted from the default settings for DataFusion and DuckDB for our micro-ish
benchmarks. For these types of benchmarks we set DataFusion to use one
partition and DuckDB to specifically use a single thread (we don't do this for
more macro-scale benchmarks where we really do want to know what happens when a
user sits down and types something against all the defaults):
https://github.com/apache/sedona-db/blob/e0e1d109480727faaf7be25923b57b4686144438/python/sedonadb/python/sedonadb/testing.py#L407-L412
https://github.com/apache/sedona-db/blob/e0e1d109480727faaf7be25923b57b4686144438/python/sedonadb/python/sedonadb/testing.py#L347-L353
I might also suggest trying the config for DuckDB to return StringViews to
see if there's any Arrow conversion overhead getting in the way (I think the
config is `SET produce_arrow_string_view = true;`).
Another thing to try is having both DuckDB and DataFusion operate on Arrow
data from memory instead of Parquet (to make sure we're not just measuring the
speed of the Parquet read). For DuckDB that's `SELECT ... FROM
the_name_of_a_python_variable_that_is_a_pyarrow_table`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]