UBarney commented on issue #16710: URL: https://github.com/apache/datafusion/issues/16710#issuecomment-3084107356
> Updated: our benchmark uses DataFusion's internal source rather than datafusion-python; I am not sure whether that makes a difference. The results are similar when running with datafusion-python as well. <details> ``` import time import datafusion ctx = datafusion.SessionContext() pathes = "/home/lv/code/datafusion/benchmarks/data/h2o/J1_1e8_NA_0.csv,/home/lv/code/datafusion/benchmarks/data/h2o/J1_1e8_1e2_0.csv,/home/lv/code/datafusion/benchmarks/data/h2o/J1_1e8_1e5_0.csv,/home/lv/code/datafusion/benchmarks/data/h2o/J1_1e8_1e8_NA.csv".split( "," ) table_names = ["x", "small", "medium", "large"] for path, table_name in zip(pathes, table_names): ctx.register_csv(name=table_name, path=path, has_header=True) sqls = """ SELECT x.id1, x.id2, x.id3, x.id4 as xid4, small.id4 as smallid4, x.id5, x.id6, x.v1, small.v2 FROM x INNER JOIN small ON x.id1 = small.id1; SELECT x.id1 as xid1, medium.id1 as mediumid1, x.id2, x.id3, x.id4 as xid4, medium.id4 as mediumid4, x.id5 as xid5, medium.id5 as mediumid5, x.id6, x.v1, medium.v2 FROM x INNER JOIN medium ON x.id2 = medium.id2; SELECT x.id1 as xid1, medium.id1 as mediumid1, x.id2, x.id3, x.id4 as xid4, medium.id4 as mediumid4, x.id5 as xid5, medium.id5 as mediumid5, x.id6, x.v1, medium.v2 FROM x LEFT JOIN medium ON x.id2 = medium.id2; SELECT x.id1 as xid1, medium.id1 as mediumid1, x.id2, x.id3, x.id4 as xid4, medium.id4 as mediumid4, x.id5 as xid5, medium.id5 as mediumid5, x.id6, x.v1, medium.v2 FROM x JOIN medium ON x.id5 = medium.id5; """ for sql in sqls.split("\n"): sql = sql.strip() if len(sql) == 0: continue total_time = 0 for _ in range(5): start = time.time() ctx.sql(sql).collect() end = time.time() total_time += end - start print("total_time / 5") ``` output ``` 0.0038166046142578125 0.014970111846923827 0.012476062774658203 0.010909128189086913 ``` </details> -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org