Kimahriman commented on PR #32987:
URL: https://github.com/apache/spark/pull/32987#issuecomment-4006359564

   Wrote a benchmark for this
   
   ```scala
   object ConditionalSubexpressionBenchmark extends SqlBasedBenchmark {
     private val N = 5 * 1000 * 1000
   
     private def runCaseWhenRegexBenchmark(groupName: String, df: Dataset[_]): 
Unit = {
       val benchmark = new Benchmark(groupName, N, output = output)
   
       val extracted = regexp_extract(col("s"), "(\\d+)", 1)
       val conditionalRegexExpr = when(extracted =!= lit(""), extracted)
   
       df.persist(StorageLevel.MEMORY_ONLY).count()
   
       benchmark.addCase("conditionals disabled", 10) { _ =>
         withSQLConf(SQLConf.SUBEXPRESSION_ELIMINATION_CONDITIONALS_ENABLED.key 
-> "false") {
           df.select(conditionalRegexExpr.alias("value")).noop()
         }
       }
   
       benchmark.addCase("conditionals enabled", 10) { _ =>
         withSQLConf(SQLConf.SUBEXPRESSION_ELIMINATION_CONDITIONALS_ENABLED.key 
-> "true") {
           df.select(conditionalRegexExpr.alias("value")).noop()
         }
       }
   
       benchmark.run()
   
       df.unpersist()
     }
   
     override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
       val alwaysMatchDf = spark.range(N).select(
         concat(lit("spark-"), (col("id") % 1000).cast("string"), 
lit("-sql")).alias("s"))
       runCaseWhenRegexBenchmark(
         "Conditional regex subexpression elimination (CASE always matches)",
         alwaysMatchDf)
   
       val neverMatchDf = 
spark.range(N).select(lit("spark-no-digits-sql").alias("s"))
       runCaseWhenRegexBenchmark(
         "Conditional regex subexpression elimination (CASE never matches)",
         neverMatchDf)
     }
   }
   ```
   
   ```
   [info] Running benchmark: Conditional regex subexpression elimination (CASE 
always matches)
   [info]   Running case: conditionals disabled
   [info]   Stopped after 10 iterations, 26754 ms
   [info]   Running case: conditionals enabled
   [info]   Stopped after 10 iterations, 14592 ms
   [info] OpenJDK 64-Bit Server VM 21.0.10 on Mac OS X 26.3
   [info] Apple M1 Max
   [info] Conditional regex subexpression elimination (CASE always matches):  
Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   [info] 
-------------------------------------------------------------------------------------------------------------------------------------------------
   [info] conditionals disabled                                                 
      2642           2675          50          1.9         528.4       1.0X
   [info] conditionals enabled                                                  
      1447           1459           9          3.5         289.4       1.8X
   [info] Running benchmark: Conditional regex subexpression elimination (CASE 
never matches)
   [info]   Running case: conditionals disabled
   [info]   Stopped after 10 iterations, 15458 ms
   [info]   Running case: conditionals enabled
   [info]   Stopped after 10 iterations, 15703 ms
   [info] OpenJDK 64-Bit Server VM 21.0.10 on Mac OS X 26.3
   [info] Apple M1 Max
   [info] Conditional regex subexpression elimination (CASE never matches):  
Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
   [info] 
------------------------------------------------------------------------------------------------------------------------------------------------
   [info] conditionals disabled                                                 
     1538           1546           6          3.3         307.6       1.0X
   [info] conditionals enabled                                                  
     1562           1570           5          3.2         312.4       1.0X
   ```
   
   Demonstrates the clear speedup and near negligible worst-case scenario where 
the second case is never evaluated. In some cases the conditionals enabled case 
was even faster in the worst case so it's basically a tossup. It seems like the 
only hesitation was that worst-case where the expression was moved to a 
function instead of inlined, resulting in additional function calls. But 
considering there are a lot of benefits to splitting code into functions for 
more JIT compilable functions, this seems like a non-issue. And even thinks 
like simple math expressions still call functions instead of directly inlining 
the operation.
   
   While this example could be handled by the special `With` expression that 
exists now for `nullif` and `between`, this has the benefit of working for all 
types of expressions, and would lead into working with the subexpression 
elimination inside higher order functions I have working in 
https://github.com/apache/spark/pull/51272


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to