Julian Foad <julian.f...@wandisco.com> writes:

>> I believe the answer is "often". A simple 'svn status' will need to
>> distinguish between 'A' and 'R', and that is done by checking
>> prior_deleted.
>>
>> And no... this amount of denormalization will not hurt us.
>
> OK.  I know that "svn status" speed is a big deal.
I'm not sure prior_deleted is an optimisation.  When I last looked at
the performance of SQLite I concluded that status would be too slow if
it ran per-node queries; it has to run per-dir queries.  So

   SELECT ... FROM BASE_NODE
   WHERE wc_id = ?1 AND local_relpath = ?2

is too slow.  Instead we need to run

   SELECT ... FROM BASE_NODE
   WHERE wc_id = ?1 AND parent_relpath = ?2

iterate over the rows caching the data, and then generate the status
for each child.

NODES and op_depth add a complication.  Getting the equivalent of
BASE_NODE is easy, we add "AND op_depth = 0", but status wants the row
with the highest op_depth for each node, and that's a different number
for each row in a per-dir query.  We can do it like this:

   SELECT ... FROM NODES n
   WHERE wc_id = ?1 AND parent_relpath = ?2
     AND op_depth = (SELECT MAX(op_depth) FROM NODES m
                     WHERE m.wc_id = n.wc_id
                       AND m.local_relpath = n.local_relpath)

but that nested SELECT is expensive.  It's not as bad as doing
per-node queries, but it is still too slow: the database query alone
is slower than the whole of 1.6 status.  I don't think this is
something that can be fixed with an index, because on my test data
the runtime generally goes up by orders of magnitude when the
indexing is wrong.

I can get round this by querying for all op_depth values and using
the cache to select the highest.  The cache is a hash, keyed on
local_relpath, that stores the data associated with the highest
op_depth seen so far and simply discards/overwrites lower op_depth.
I've prototyped this and it's as fast as the per-dir BASE_NODE query
on my test data.  That's not surprising, since my test data consists
mostly of nodes with a single op_depth and a few with two.  We have
to have good performance on such working copies because they are the
most common; I think it will be unusual to have a large working copy
where lots of nodes have a large number of op_depth values.

Now to prior_deleted.  The fundamental status query looks like

   SELECT ... FROM NODES
   WHERE wc_id = ?1 AND parent_relpath = ?2

and all op_depth values are seen.  It's quite simple to cache the two
highest op_depth rows for each node, so prior_deleted only provides
information that is already available.  It's not an optimisation for
status; if anything it will make the database bigger and slower.
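To make the caching idea concrete, here is a rough Python sketch (the
function name and the presence handling are mine and simplified; it
assumes the wcx.db produced by the generator script below).  A single
per-dir query over all op_depth values, keeping the two highest rows
per child, answers the 'A' versus 'R' question without a
prior_deleted column:

import sqlite3

def read_children(db, wc_id, parent_relpath):
    # One per-dir query over all op_depth values; no nested SELECT MAX().
    # Maps local_relpath -> [highest row, second-highest row].
    children = {}
    for relpath, kind, op_depth, presence in db.execute(
            "select local_relpath, kind, op_depth, presence from nodes"
            " where wc_id = ? and parent_relpath = ?",
            (wc_id, parent_relpath)):
        rows = children.setdefault(relpath, [None, None])
        row = (op_depth, kind, presence)
        if rows[0] is None or op_depth > rows[0][0]:
            rows[1] = rows[0]   # old highest becomes second highest
            rows[0] = row
        elif rows[1] is None or op_depth > rows[1][0]:
            rows[1] = row
    return children

db = sqlite3.connect('wcx.db')
for relpath, (highest, second) in sorted(
        read_children(db, 1, 'zag1').items()):
    # A working node (op_depth > 0) sitting on top of a lower layer is
    # a replacement ('R'); one with no lower layer is a plain add ('A').
    # That is the information prior_deleted would duplicate.
    if highest[0] > 0 and highest[2] == 'normal':
        print("%s %s" % (relpath, 'R' if second else 'A'))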
Below are the Python script that generates a big NODES database, and
a C program that prototypes the status queries:

#!/usr/bin/python

import os, sqlite3

try:
  os.remove('wcx.db')
except:
  pass
c = sqlite3.connect('wcx.db')
c.execute("""create table repository (
               id integer primary key autoincrement,
               root text unique not null,
               uuid text not null)""")
c.execute("""create index i_uuid on repository (uuid)""")
c.execute("""create index i_root on repository (root)""")
c.execute("""create table wcroot (
               id integer primary key autoincrement,
               local_abspath text unique)""")
c.execute("""create unique index i_local_abspath on wcroot
               (local_abspath)""")
c.execute("""create table nodes (
               wc_id integer not null references wcroot (id),
               local_relpath text not null,
               op_depth integer not null,
               parent_relpath text,
               repos_id integer references repository (id),
               repos_path text,
               revision integer,
               presence text not null,
               depth text,
               moved_here integer,
               moved_to text,
               kind text not null,
               changed_revision integer,
               changed_date integer,
               changed_author text,
               checksum text,
               properties blob,
               translated_size integer,
               last_mod_time integer,
               dav_cache blob,
               symlink_target text,
               file_external text,
               primary key(wc_id, local_relpath, op_depth))""")
c.execute("""create index i_parent on nodes
               (wc_id, parent_relpath, local_relpath)""")
c.execute("""insert into repository (root, uuid) values (
               "http://example.com/repo",
               "f738be9e-409d-481f-b246-1fb6a969aba2")""")
c.execute("""insert into wcroot (local_abspath) values ("/wc")""")
c.execute("""insert into nodes (
               wc_id, local_relpath, op_depth, repos_id, repos_path,
               parent_relpath, presence, kind)
             values (
               1, "", 0, 1, "trunk", NULL, "normal", "dir")""")
for i in range(100):
  c.execute("""insert into nodes (
                 wc_id, local_relpath, op_depth, repos_id, repos_path,
                 parent_relpath, presence, kind)
               values (
                 1, "foo"""+str(i)+"""", 0, 1,
                 "trunk/foo"""+str(i)+"""", "", "normal", "file")""")
  if i >= 30:
    continue
  c.execute("""insert into nodes (
                 wc_id, local_relpath, op_depth, repos_id, repos_path,
                 parent_relpath, presence, kind)
               values (
                 1, "zag"""+str(i)+"""", 0, 1,
                 "trunk/zag"""+str(i)+"""", "", "normal", "dir")""")
  c.execute("""insert into nodes (
                 wc_id, local_relpath, op_depth, repos_id, repos_path,
                 parent_relpath, presence, kind)
               values (
                 1, "zig"""+str(i)+"""", 0, 1,
                 "trunk/zig"""+str(i)+"""", "", "normal", "dir")""")
  for j in range(100):
    c.execute("""insert into nodes (
                   wc_id, local_relpath, op_depth, repos_id, repos_path,
                   parent_relpath, presence, kind)
                 values (
                   1, "zag"""+str(i)+"/foo"+str(j)+"""", 0, 1,
                   "trunk/zag"""+str(i)+"/foo"+str(j)+"""",
                   "zag"""+str(i)+"""", "normal", "file")""")
    if j % 10 == 1:
      c.execute("""insert into nodes (
                     wc_id, local_relpath, op_depth, repos_id, repos_path,
                     parent_relpath, presence, kind)
                   values (
                     1, "zag"""+str(i)+"/foo"+str(j)+"""", 3, 1,
                     "trunk/zag"""+str(i)+"/foo"+str(j)+"""",
                     "zag"""+str(i)+"""", "base-delete", "file")""")
      c.execute("""insert into nodes (
                     wc_id, local_relpath, op_depth, repos_id, repos_path,
                     parent_relpath, presence, kind)
                   values (
                     1, "zag"""+str(i)+"/bar"+str(j)+"""", 3, null, null,
                     "zag"""+str(i)+"""", "normal", "file")""")
    c.execute("""insert into nodes (
                   wc_id, local_relpath, op_depth, repos_id, repos_path,
                   parent_relpath, presence, kind)
                 values (
                   1, "zig"""+str(i)+"/foo"+str(j)+"""", 0, 1,
                   "trunk/zig"""+str(i)+"/foo"+str(j)+"""",
                   "zig"""+str(i)+"""", "normal", "file")""")
    if j >= 30:
      continue
    c.execute("""insert into nodes (
                   wc_id, local_relpath, op_depth, repos_id, repos_path,
                   parent_relpath, presence, kind)
                 values (
                   1, "zig"""+str(i)+"/zag"+str(j)+"""", 0, 1,
                   "trunk/zig"""+str(i)+"/zag"+str(j)+"""",
                   "zig"""+str(i)+"""", "normal", "dir")""")
    for k in range(100):
      c.execute("""insert into nodes (
                     wc_id, local_relpath, op_depth, repos_id, repos_path,
                     parent_relpath, presence, kind)
                   values (
                     1, "zig"""+str(i)+"/zag"+str(j)+"/foo"+str(k)+"""",
                     0, 1,
                     "trunk/zig"""+str(i)+"/zag"+str(j)+"/foo"+str(k)+"""",
                     "zig"""+str(i)+"/zag"+str(j)+"""", "normal",
                     "file")""")
c.commit()
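(For a quick cross-check of the nested-SELECT cost without building
the C prototype, something like the following fragment, run against
the wcx.db the script produces, times the plain per-dir query against
the MAX(op_depth) variant over the top-level directories.  A rough
sketch only; the absolute numbers will of course vary.)

import sqlite3, time

db = sqlite3.connect('wcx.db')

PLAIN = ("select local_relpath, kind, op_depth from nodes"
         " where wc_id = ? and parent_relpath = ?")
NESTED = ("select local_relpath, kind, op_depth from nodes n"
          " where wc_id = ? and parent_relpath = ?"
          " and op_depth = (select max(op_depth) from nodes m"
          "                  where m.wc_id = n.wc_id"
          "                    and m.local_relpath = n.local_relpath)")

def time_query(sql, runs=10):
    # Every top-level directory the generator script creates.
    parents = ([''] + ['zag%d' % i for i in range(30)]
                    + ['zig%d' % i for i in range(30)])
    start = time.time()
    for _ in range(runs):
        for parent in parents:
            list(db.execute(sql, (1, parent)))
    return time.time() - start

print("plain:  %.3fs" % time_query(PLAIN))
print("nested: %.3fs" % time_query(NESTED))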
"zig"""+str(i)+"/zag"+str(j)+"""", 0, 1, "trunk/zig"""+str(i)+"/zag"+str(j)+"""", "zig"""+str(i)+"""", "normal", "dir")""") for k in range(100): c.execute("""insert into nodes ( wc_id, local_relpath, op_depth, repos_id, repos_path, parent_relpath, presence, kind) values ( 1, "zig"""+str(i)+"/zag"+str(j)+"/foo"+str(k)+"""", 0, 1, "trunk/zig"""+str(i)+"/zag"+str(j)+"/foo"+str(k)+"""", "zig"""+str(i)+"/zag"+str(j)+"""", "normal", "file")""") c.commit() #include "svn_pools.h" #include "svn_sqlite.h" #include <stdio.h> static svn_error_t * status_query_per_node(svn_sqlite__db_t *sdb, const char *local_relpath, svn_boolean_t display, apr_pool_t *pool) { svn_sqlite__stmt_t *stmt; svn_boolean_t have_row; const char *kind; apr_pool_t *subpool; apr_array_header_t *children; int num, i; /* Display this node. */ SVN_ERR(svn_sqlite__get_statement(&stmt, sdb, 0)); SVN_ERR(svn_sqlite__bindf(stmt, "is", 1, local_relpath)); SVN_ERR(svn_sqlite__step(&have_row, stmt)); if (!have_row) { SVN_ERR(svn_sqlite__reset(stmt)); return SVN_NO_ERROR; } kind = svn_sqlite__column_text(stmt, 0, pool); SVN_ERR(svn_sqlite__reset(stmt)); if (display) printf("%s %s\n", local_relpath, kind); if (strcmp(kind, "dir")) return SVN_NO_ERROR; /* Gather children. */ #if 0 SVN_ERR(svn_sqlite__get_statement(&stmt, sdb, 2)); SVN_ERR(svn_sqlite__bindf(stmt, "is", 1, local_relpath)); SVN_ERR(svn_sqlite__step(&have_row, stmt)); if (!have_row) { SVN_ERR(svn_sqlite__reset(stmt)); return SVN_NO_ERROR; } num = svn_sqlite__column_int64(stmt, 0); SVN_ERR(svn_sqlite__reset(stmt)); #else num = 10; #endif children = apr_array_make(pool, num, sizeof(const char *)); SVN_ERR(svn_sqlite__get_statement(&stmt, sdb, 5)); SVN_ERR(svn_sqlite__bindf(stmt, "is", 1, local_relpath)); SVN_ERR(svn_sqlite__step(&have_row, stmt)); while (have_row) { APR_ARRAY_PUSH(children, const char *) = svn_sqlite__column_text(stmt, 0, pool); SVN_ERR(svn_sqlite__step(&have_row, stmt)); } SVN_ERR(svn_sqlite__reset(stmt)); /* Display children. */ subpool = svn_pool_create(pool); for (i = 0; i < children->nelts; ++i) { const char *child_relpath = APR_ARRAY_IDX(children, i, const char *); svn_pool_clear(subpool); SVN_ERR(status_query_per_node(sdb, child_relpath, display, subpool)); } svn_pool_destroy(subpool); return SVN_NO_ERROR; } struct node_t { svn_boolean_t is_dir; int op_depth; }; static svn_error_t * status_query_per_dir(svn_sqlite__db_t *sdb, const char *local_relpath, svn_boolean_t display, apr_pool_t *pool) { svn_sqlite__stmt_t *stmt; svn_boolean_t have_row; const char *kind; apr_pool_t *subpool; apr_hash_t *children = apr_hash_make(pool); apr_hash_index_t *hi; /* Display this node. */ SVN_ERR(svn_sqlite__get_statement(&stmt, sdb, 0)); SVN_ERR(svn_sqlite__bindf(stmt, "is", 1, local_relpath)); SVN_ERR(svn_sqlite__step(&have_row, stmt)); if (!have_row) { SVN_ERR(svn_sqlite__reset(stmt)); return SVN_NO_ERROR; } kind = svn_sqlite__column_text(stmt, 0, pool); SVN_ERR(svn_sqlite__reset(stmt)); if (display) printf("%s %s\n", local_relpath, kind); if (strcmp(kind, "dir")) return SVN_NO_ERROR; /* Gather children. 
static svn_error_t *
status_query_per_dir(svn_sqlite__db_t *sdb,
                     const char *local_relpath,
                     svn_boolean_t display,
                     apr_pool_t *pool)
{
  svn_sqlite__stmt_t *stmt;
  svn_boolean_t have_row;
  const char *kind;
  apr_pool_t *subpool;
  apr_hash_t *children = apr_hash_make(pool);
  apr_hash_index_t *hi;

  /* Display this node. */
  SVN_ERR(svn_sqlite__get_statement(&stmt, sdb, 0));
  SVN_ERR(svn_sqlite__bindf(stmt, "is", 1, local_relpath));
  SVN_ERR(svn_sqlite__step(&have_row, stmt));
  if (!have_row)
    {
      SVN_ERR(svn_sqlite__reset(stmt));
      return SVN_NO_ERROR;
    }
  kind = svn_sqlite__column_text(stmt, 0, pool);
  SVN_ERR(svn_sqlite__reset(stmt));
  if (display)
    printf("%s %s\n", local_relpath, kind);
  if (strcmp(kind, "dir"))
    return SVN_NO_ERROR;

  /* Gather children, keeping only the highest op_depth for each. */
  SVN_ERR(svn_sqlite__get_statement(&stmt, sdb, 1));
  SVN_ERR(svn_sqlite__bindf(stmt, "is", 1, local_relpath));
  SVN_ERR(svn_sqlite__step(&have_row, stmt));
  while (have_row)
    {
      const char *child_relpath = svn_sqlite__column_text(stmt, 0, NULL);
      int op_depth = svn_sqlite__column_int64(stmt, 2);
      struct node_t *node;

      node = apr_hash_get(children, child_relpath, APR_HASH_KEY_STRING);
      if (!node)
        {
          int klen = strlen(child_relpath);

          node = apr_palloc(pool, sizeof(*node));
          node->op_depth = -1;
          apr_hash_set(children,
                       apr_pstrmemdup(pool, child_relpath, klen), klen,
                       node);
        }
      if (op_depth > node->op_depth)
        {
          node->op_depth = op_depth;
          node->is_dir = !strcmp("dir",
                                 svn_sqlite__column_text(stmt, 1, NULL));
        }
      SVN_ERR(svn_sqlite__step(&have_row, stmt));
    }
  SVN_ERR(svn_sqlite__reset(stmt));

  /* Display children. */
  subpool = svn_pool_create(pool);
  for (hi = apr_hash_first(pool, children); hi; hi = apr_hash_next(hi))
    {
      struct node_t *node;

      svn_pool_clear(subpool);
      node = svn__apr_hash_index_val(hi);
      if (node->is_dir)
        SVN_ERR(status_query_per_dir(sdb, svn__apr_hash_index_key(hi),
                                     display, subpool));
      else if (display)
        printf("%s file\n", (const char *)svn__apr_hash_index_key(hi));
    }
  svn_pool_destroy(subpool);

  return SVN_NO_ERROR;
}

static void
usage(const char *arg)
{
  fprintf(stderr,
          "usage: %s n|d q|v s|m db\n"
          "where: n does per-node queries\n"
          "       d does per-dir queries\n"
          "       q suppresses output\n"
          "       v displays output\n"
          "       s uses a single transaction\n"
          "       m uses multiple transactions\n",
          arg);
  exit(1);
}

static void
parse_opts(int argc, const char *argv[],
           svn_boolean_t *per_dir,
           svn_boolean_t *display,
           svn_boolean_t *savepoint)
{
  if (argc != 5)
    usage(argv[0]);

  if (!strcmp(argv[1], "n"))
    *per_dir = FALSE;
  else if (!strcmp(argv[1], "d"))
    *per_dir = TRUE;
  else
    usage(argv[0]);

  if (!strcmp(argv[2], "q"))
    *display = FALSE;
  else if (!strcmp(argv[2], "v"))
    *display = TRUE;
  else
    usage(argv[0]);

  if (!strcmp(argv[3], "s"))
    *savepoint = TRUE;
  else if (!strcmp(argv[3], "m"))
    *savepoint = FALSE;
  else
    usage(argv[0]);
}

int
main(int argc, const char *argv[])
{
  apr_pool_t *pool;
  svn_sqlite__db_t *sdb;
  svn_boolean_t per_dir, display, savepoint;
  const char * const statements[] = {
    /* 0: kind of a single node, highest op_depth wins */
    "select kind from nodes"
    " where wc_id = ?1 and local_relpath = ?2"
    " order by op_depth desc limit 1",
    /* 1: all rows, all op_depths, for one directory */
    "select local_relpath, kind, op_depth from nodes"
    " where wc_id = ?1 and parent_relpath = ?2",
    /* 2: number of distinct children of one directory */
    "select count(*) from (select distinct local_relpath from nodes"
    " where wc_id = ?1 and parent_relpath = ?2)",
    /* 3 */
    "savepoint q",
    /* 4 */
    "release savepoint q",
    /* 5 */
    "select distinct local_relpath from nodes"
    " where wc_id = ?1 and parent_relpath = ?2",
    /* 6 */
    "select local_relpath from nodes"
    " where wc_id = ?1 and parent_relpath = ?2",
    /* 7: the slow nested-SELECT variant */
    "select local_relpath, kind, op_depth from nodes n"
    " where wc_id = ?1 and parent_relpath = ?2"
    " and op_depth = (select max(op_depth) from nodes m"
    "                  where m.wc_id = n.wc_id"
    "                    and m.local_relpath = n.local_relpath)",
    NULL
  };

  parse_opts(argc, argv, &per_dir, &display, &savepoint);

  apr_initialize();
  pool = svn_pool_create(NULL);
  SVN_INT_ERR(svn_sqlite__open(&sdb, argv[argc - 1],
                               svn_sqlite__mode_rwcreate,
                               statements, 0, NULL, pool, pool));
  if (savepoint)
    SVN_INT_ERR(svn_sqlite__exec_statements(sdb, 3));
  if (per_dir)
    SVN_INT_ERR(status_query_per_dir(sdb, "", display, pool));
  else
    SVN_INT_ERR(status_query_per_node(sdb, "", display, pool));
  if (savepoint)
    SVN_INT_ERR(svn_sqlite__exec_statements(sdb, 4));

  return EXIT_SUCCESS;
}

-- 
Philip