UBarney commented on issue #16710:
URL: https://github.com/apache/datafusion/issues/16710#issuecomment-3084107356
> Updated: our benchmark is using datafusion internal source to benchmark
> instead of datafusion-python, I am not sure if it will make a difference.

The results are similar when running with datafusion-python as well.
<details>
```
# Benchmark script: registers the H2O join CSV files as tables in a
# datafusion-python SessionContext, runs each join query several times,
# and prints the average time per query in seconds.
import time

import datafusion

ctx = datafusion.SessionContext()

# CSV inputs, listed in the same order as `table_names` below.
pathes = [
    "/home/lv/code/datafusion/benchmarks/data/h2o/J1_1e8_NA_0.csv",
    "/home/lv/code/datafusion/benchmarks/data/h2o/J1_1e8_1e2_0.csv",
    "/home/lv/code/datafusion/benchmarks/data/h2o/J1_1e8_1e5_0.csv",
    "/home/lv/code/datafusion/benchmarks/data/h2o/J1_1e8_1e8_NA.csv",
]
table_names = ["x", "small", "medium", "large"]
for path, table_name in zip(pathes, table_names):
    ctx.register_csv(name=table_name, path=path, has_header=True)

sqls = """
SELECT x.id1, x.id2, x.id3, x.id4 as xid4, small.id4 as smallid4, x.id5,
x.id6, x.v1, small.v2 FROM x INNER JOIN small ON x.id1 = small.id1;
SELECT x.id1 as xid1, medium.id1 as mediumid1, x.id2, x.id3, x.id4 as xid4,
medium.id4 as mediumid4, x.id5 as xid5, medium.id5 as mediumid5, x.id6, x.v1,
medium.v2 FROM x INNER JOIN medium ON x.id2 = medium.id2;
SELECT x.id1 as xid1, medium.id1 as mediumid1, x.id2, x.id3, x.id4 as xid4,
medium.id4 as mediumid4, x.id5 as xid5, medium.id5 as mediumid5, x.id6, x.v1,
medium.v2 FROM x LEFT JOIN medium ON x.id2 = medium.id2;
SELECT x.id1 as xid1, medium.id1 as mediumid1, x.id2, x.id3, x.id4 as xid4,
medium.id4 as mediumid4, x.id5 as xid5, medium.id5 as mediumid5, x.id6, x.v1,
medium.v2 FROM x JOIN medium ON x.id5 = medium.id5;
"""

# Number of timed repetitions per query; the printed value is their mean.
num_runs = 5

# Each statement spans several lines, so split on the ";" terminator rather
# than on newlines (a newline split would hand the engine partial statements).
for sql in sqls.split(";"):
    # Collapse the line breaks and surrounding whitespace into single spaces.
    sql = " ".join(sql.split())
    if not sql:
        # Skip the empty remainder after the final ";".
        continue
    total_time = 0.0
    for _ in range(num_runs):
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump if the wall clock changes.
        start = time.perf_counter()
        ctx.sql(sql).collect()
        total_time += time.perf_counter() - start
    # Print the computed average, not the literal expression text.
    print(total_time / num_runs)
```
output
```
0.0038166046142578125
0.014970111846923827
0.012476062774658203
0.010909128189086913
```
</details>
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]