Now attached as text files (to be renamed to .py) to prevent the
mailer software from dropping them...
Bye,
Erik.
On Sat, Feb 5, 2011 at 7:05 PM, Erik Huelsmann <[email protected]> wrote:
> Yesterday on IRC, Bert, Philip and I were chatting about our SQLite
> perf issues and how Philip's findings in the past suggested that
> SQLite wasn't using its indices to optimize our queries.
>
> After searching and discussing its documentation, Philip suggested the
> -too obvious- "maybe we have the wrong indices".
>
> So, I went to work with his "fake database generator script" (attached
> as "test.py").
>
>
> The type of query we're seeing problematic performance with looks like
> the one below. The essential part is the WHERE clause.
>
> SELECT * FROM nodes WHERE wc_id = 1 AND (local_relpath = 'foo' OR
> local_relpath like 'foo%');
>
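
A quick way to check whether SQLite actually uses an index for this
kind of query is to prefix it with EXPLAIN QUERY PLAN. A minimal
sketch, assuming the wcx.db produced by the attached test.py:

import sqlite3
c = sqlite3.connect('wcx.db')
plan = c.execute("""explain query plan
                    select * from nodes where wc_id = 1 AND
                      (local_relpath = 'foo' OR local_relpath like 'foo%')""")
for row in plan:
    print row

If the plan output mentions an index (e.g. "USING INDEX"), the query is
using one; otherwise SQLite is scanning the whole nodes table.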
>
> We discussed 3 ways to achieve the effect of this query:
>
> 1. The query itself
> 2. The query stated as a UNION of two queries
> 3. Running the two parts of the UNION manually ourselves.
>
> Ad (1)
> This query doesn't give us the performance we had hoped to get from
> using a database.
>
> Ad (2)
> In the past, UNIONs have been explicitly removed because they were
> creating temporary tables (on disk!). However, since then we have
> changed our SQLite setup to create temporary tables in memory, so the
> option should really be re-evaluated.
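
For concreteness, this is the UNION variant as it also appears in
perf.py below, together with the pragma that keeps SQLite's temporary
tables in memory (a minimal standalone sketch against the generated
wcx.db):

import sqlite3
c = sqlite3.connect('wcx.db')                # the database built by test.py
c.execute("pragma temp_store=memory")        # temporary tables in RAM, not on disk
rows = c.execute("""select * from nodes
                    where wc_id = 1 AND local_relpath like 'foo/%'
                    union
                    select * from nodes
                    where wc_id = 1 AND local_relpath = 'foo'""").fetchall()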
>
> Ad (3)
> I'd hate to have to use two queries in all places in our source where
> we want to run queries like these. As a result, I think this scenario
> should be avoided if at all possible.
>
>
> So, I've created 'perf.py' to evaluate each of these scenarios,
> measuring how each of them is affected by adding different indices.
>
> These are my findings:
>
> Scenario (1) [an AND combined with a complex OR] doesn't perform well
> under any circumstance.
>
> Scenario (2) performs differently, depending on the available indices.
>
> Scenario (3) performs roughly the same as scenario (2).
>
>
> Scenario (2) takes ~0.27 seconds to evaluate in the unmodified
> database. Adding an index on (wc_id, local_relpath) makes the
> execution time drop to ~0.000156 seconds!
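
The index used for that measurement is the one left commented out in
perf.py below; created on its own it is simply:

import sqlite3
c = sqlite3.connect('wcx.db')
c.execute("create index i_wc_id_rp on nodes (wc_id, local_relpath)")
c.commit()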
>
>
> Seems Philip was right :-) We need to carefully review the indices in
> our database to make sure they support good query performance.
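
A quick way to review the indices a database currently has (minimal
sketch; the automatic indices SQLite creates for PRIMARY KEY and
UNIQUE constraints carry no sql text and are filtered out here):

import sqlite3
c = sqlite3.connect('wcx.db')
for name, sql in c.execute("""select name, sql from sqlite_master
                              where type = 'index' and sql is not null"""):
    print name, '--', sql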
>
>
> Bye,
>
>
> Erik.
>
#!/usr/bin/python
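# perf.py: times one of the three query strategies against the wcx.db
# generated by test.py; uncomment the strategy and index lines to vary the run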
import os, sqlite3, time
c = sqlite3.connect('wcx.db')
c.execute("""pragma case_sensitive_like=1""")
c.execute("""pragma foreign_keys=on""")
c.execute("""pragma synchronous=off""")
c.execute("""pragma temp_store=memory""")
start = time.clock() # cpu clock as float in secs
#c.execute("""drop index i_wc_id_rp;""")
#c.execute("""create index i_wc_id_rp on nodes (wc_id, local_relpath);""")
print c.execute("select name from sqlite_master where type = 'index'").fetchall()
# strategy 1
c.execute("""select * from nodes where wc_id = 1 AND
(local_relpath like 'foo/%'
OR local_relpath = 'foo');""");
# strategy 2
#c.execute("""select * from nodes where wc_id = 1 AND local_relpath like 'foo/%'
# union select * from nodes where wc_id = 1 AND local_relpath =
'foo';""")
# strategy 3
#c.execute("""select * from nodes where wc_id = 1 AND local_relpath like
'foo/%';""")
#c.execute("""select * from nodes where wc_id = 1 AND local_relpath = 'foo';""")
end = time.clock()
print "timing: %5f\n" % (end - start)
#!/usr/bin/python
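# test.py: Philip's fake database generator; builds wcx.db and fills the
# nodes table with a few hundred thousand rows to measure query performance against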
import os, sqlite3
try: os.remove('wcx.db')
except: pass
c = sqlite3.connect('wcx.db')
c.execute("""pragma case_sensitive_like=1""")
c.execute("""pragma foreign_keys=on""")
c.execute("""pragma synchronous=off""")
c.execute("""pragma temp_store=memory""")
c.execute("""create table repository (
id integer primary key autoincrement,
root text unique not null,
uuid text not null)""")
c.execute("""create index i_uuid on repository (uuid)""")
c.execute("""create index i_root on repository (root)""")
c.execute("""create table wcroot (
id integer primary key autoincrement,
local_abspath text unique)""")
c.execute("""create unique index i_local_abspath on wcroot (local_abspath)""")
c.execute("""create table nodes (
wc_id integer not null references wcroot (id),
local_relpath text not null,
op_depth integer not null,
parent_relpath text,
repos_id integer references repository (id),
repos_path text,
revision integer,
presence text not null,
depth text,
moved_here integer,
moved_to text,
kind text not null,
changed_revision integer,
changed_date integer,
changed_author text,
checksum text,
properties blob,
translated_size integer,
last_mod_time integer,
dav_cache blob,
symlink_target text,
file_external text,
primary key(wc_id, local_relpath, op_depth))""")
c.execute("""create index i_parent on nodes (wc_id,
parent_relpath,
local_relpath, op_depth)""")
c.execute("""create table lock (
repos_id integer not null references repository (id),
repos_relpath text not null,
lock_token text not null,
lock_owner text,
lock_comment text,
lock_date integer,
primary key (repos_id, repos_relpath))""")
c.execute("""insert into repository (root, uuid) values (
"http://example.com/repo",
"f738be9e-409d-481f-b246-1fb6a969aba2")""")
c.execute("""insert into wcroot(local_abspath) values ("/wc")""")
c.execute("""insert into nodes (
wc_id,
local_relpath,
op_depth,
repos_id,
repos_path,
parent_relpath,
presence,
kind)
values (
1,
"",
0,
1,
"trunk",
NULL,
"normal",
"dir")""")
# 100 plain files foo0..foo99 directly below the working copy root
for i in range(100):
    c.execute("""insert into nodes (
        wc_id,
        local_relpath,
        op_depth,
        repos_id,
        repos_path,
        parent_relpath,
        presence,
        kind)
        values (
        1,
        "foo"""+str(i)+"""",
        0,
        1,
        "trunk/foo"""+str(i)+"""",
        "",
        "normal",
        "file")""")
    # only the first 60 iterations also get zag<i>/zig<i> subtrees
    if i >= 60:
        continue
    c.execute("""insert into nodes (
        wc_id,
        local_relpath,
        op_depth,
        repos_id,
        repos_path,
        parent_relpath,
        presence,
        kind)
        values (
        1,
        "zag"""+str(i)+"""",
        0,
        1,
        "trunk/zag"""+str(i)+"""",
        "",
        "normal",
        "dir")""")
    c.execute("""insert into nodes (
        wc_id,
        local_relpath,
        op_depth,
        repos_id,
        repos_path,
        parent_relpath,
        presence,
        kind)
        values (
        1,
        "zig"""+str(i)+"""",
        0,
        1,
        "trunk/zig"""+str(i)+"""",
        "",
        "normal",
        "dir")""")
    for j in range(100):
        # 100 files below each zag<i> directory
        c.execute("""insert into nodes (
            wc_id,
            local_relpath,
            op_depth,
            repos_id,
            repos_path,
            parent_relpath,
            presence,
            kind)
            values (
            1,
            "zag"""+str(i)+"/foo"+str(j)+"""",
            0,
            1,
            "trunk/zag"""+str(i)+"/foo"+str(j)+"""",
            "zag"""+str(i)+"""",
            "normal",
            "file")""")
        # every 10th foo also gets an op_depth 3 base-delete row and a locally
        # added bar<j> next to it (grouping both under this condition is assumed)
        if j % 10 == 1:
            c.execute("""insert into nodes (
                wc_id,
                local_relpath,
                op_depth,
                repos_id,
                repos_path,
                parent_relpath,
                presence,
                kind)
                values (
                1,
                "zag"""+str(i)+"/foo"+str(j)+"""",
                3,
                1,
                "trunk/zag"""+str(i)+"/foo"+str(j)+"""",
                "zag"""+str(i)+"""",
                "base-delete",
                "file")""")
            c.execute("""insert into nodes (
                wc_id,
                local_relpath,
                op_depth,
                repos_id,
                repos_path,
                parent_relpath,
                presence,
                kind)
                values (
                1,
                "zag"""+str(i)+"/bar"+str(j)+"""",
                3,
                null,
                null,
                "zag"""+str(i)+"""",
                "normal",
                "file")""")
        # 100 files below each zig<i> directory
        c.execute("""insert into nodes (
            wc_id,
            local_relpath,
            op_depth,
            repos_id,
            repos_path,
            parent_relpath,
            presence,
            kind)
            values (
            1,
            "zig"""+str(i)+"/foo"+str(j)+"""",
            0,
            1,
            "trunk/zig"""+str(i)+"/foo"+str(j)+"""",
            "zig"""+str(i)+"""",
            "normal",
            "file")""")
        if j >= 60:
            continue
        # the first 60 zig<i>/zag<j> subdirectories each get 100 files
        c.execute("""insert into nodes (
            wc_id,
            local_relpath,
            op_depth,
            repos_id,
            repos_path,
            parent_relpath,
            presence,
            kind)
            values (
            1,
            "zig"""+str(i)+"/zag"+str(j)+"""",
            0,
            1,
            "trunk/zig"""+str(i)+"/zag"+str(j)+"""",
            "zig"""+str(i)+"""",
            "normal",
            "dir")""")
        for k in range(100):
            c.execute("""insert into nodes (
                wc_id,
                local_relpath,
                op_depth,
                repos_id,
                repos_path,
                parent_relpath,
                presence,
                kind)
                values (
                1,
                "zig"""+str(i)+"/zag"+str(j)+"/foo"+str(k)+"""",
                0,
                1,
                "trunk/zig"""+str(i)+"/zag"+str(j)+"/foo"+str(k)+"""",
                "zig"""+str(i)+"/zag"+str(j)+"""",
                "normal",
                "file")""")
c.commit()