Changeset: e1dd2e16216e for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=e1dd2e16216e Removed Files: sql/backends/monet5/bam/Tests/load.sql Modified Files: sql/backends/monet5/bam/Tests/test.SQL.py sql/backends/monet5/bam/Tests/test.stable.err sql/backends/monet5/bam/Tests/test.stable.out Branch: bamloader Log Message:
Improved test greatly diffs (truncated from 4768 to 300 lines): diff --git a/sql/backends/monet5/bam/Tests/load.sql b/sql/backends/monet5/bam/Tests/load.sql deleted file mode 100644 --- a/sql/backends/monet5/bam/Tests/load.sql +++ /dev/null @@ -1,2 +0,0 @@ -CALL bam_loader_repos('PWD/files', 0, 32); -CALL bam_loader_repos('PWD/files/queryname', 1, 32); diff --git a/sql/backends/monet5/bam/Tests/test.SQL.py b/sql/backends/monet5/bam/Tests/test.SQL.py --- a/sql/backends/monet5/bam/Tests/test.SQL.py +++ b/sql/backends/monet5/bam/Tests/test.SQL.py @@ -13,38 +13,119 @@ def new_client(): def set_var(c, name, value, numeric=True): if numeric: - c.stdin.write('DECLARE %s integer;'% name) - c.stdin.write('SET %s=%s;'%(name, value)) + c.stdin.write('DECLARE %s integer;\n'% name) + c.stdin.write('SET %s=%s;\n'%(name, value)) else: - c.stdin.write('DECLARE %s varchar(32);'% name) - c.stdin.write('SET %s=\'%s\';'% (name, value)) + c.stdin.write('DECLARE %s varchar(32);\n'% name) + c.stdin.write('SET %s=\'%s\';\n'% (name, value)) -# start with loading bam files into the database + + + + +# start with loading bam files into the database with all loader functions we have built +# we assume that all bam or sam files are stored into the files directory. +# these files will all be loaded into the database with all loading methods that we have. +# files in the directory files/queryname are assumed to be ordered by queryname and are +# therefore also loaded into the pairwise storage schema + c = new_client() -for ln in open(os.path.join(SRCDIR,"load.sql")): - c.stdin.write(ln.replace('PWD', SRCDIR)) + +# get files +files_path = os.path.join(SRCDIR, "files") +files_qname_path = os.path.join(files_path, "queryname") +files = [ f for f in os.listdir(files_path) if os.path.isfile(os.path.join(files_path, f)) ] +files_qname = [ f for f in os.listdir(files_qname_path) if os.path.isfile(os.path.join(files_qname_path, f)) ] + +# use bam_loader_repos +c.stdin.write("CALL bam_loader_repos('%s', 0, 8);\n"% files_path) +c.stdin.write("CALL bam_loader_repos('%s', 1, 8);\n"% files_qname_path) + +# use bam_loader_files by first writing temporary files and then invoking these loaders +f_tmp_path = os.path.join(SRCDIR, "tmp_list.txt") +f_tmp_qname_path = os.path.join(SRCDIR, "tmp_list_qname.txt") +f_tmp = open(f_tmp_path, "w") +f_tmp_qname = open(f_tmp_qname_path, "w") + +for file in files: + f_tmp.write("%s\n"% os.path.join(files_path, file)) + +for file in files_qname: + f_tmp_qname.write("%s\n"% os.path.join(files_qname_path, file)) + +f_tmp.close() +f_tmp_qname.close() + +c.stdin.write("CALL bam_loader_files('%s', 0, 8);\n"% f_tmp_path) +c.stdin.write("CALL bam_loader_files('%s', 1, 8);\n"% f_tmp_qname_path) + +# use bam_loader_file on all separate files +for file in files: + c.stdin.write("CALL bam_loader_file('%s', 0);\n"% os.path.join(files_path, file)) + +for file in files_qname: + c.stdin.write("CALL bam_loader_file('%s', 1);\n"% os.path.join(files_qname_path, file)) out, err = c.communicate() sys.stdout.write(out) sys.stderr.write(err) +# clean up the temporary files +os.remove(f_tmp_path) +os.remove(f_tmp_qname_path) -# now retrieve the file ids that have been inserted -# (should be 1,2,..,nr_files, but still we extract it from the -# db to be flexible to logical changes in the bam loader as -# much as possible) + +# extract the files that have been inserted into the database +# gives us a list of (file_id, dbschema) tuples c = new_client() -c.stdin.write("SELECT file_id, dbschema FROM bam.files;") +c.stdin.write("SELECT file_id, dbschema FROM bam.files;\n") out, err = c.communicate() -p = re.compile('^\s*\[\s*(\d)\s*,\s*(\d)\s*\]\s*$', re.MULTILINE) -files_to_test = [] +sys.stdout.write(out) +sys.stderr.write(err) + +p = re.compile('^\s*\[\s*(\d+)\s*,\s*(\d)\s*\]\s*$', re.MULTILINE) # Parses raw DB output +inserted = [] for match in p.finditer(out): - files_to_test.append((int(match.group(1)), int(match.group(2)))) + inserted.append((int(match.group(1)), int(match.group(2)))) -# Now we will execute all benchmark queries on all BAM files in the bam.files table -# and output all data contained in the aux table. -# Furthermore, we transfer every file to the export table, use sam_export to write -# the contents to a SAM file and then print the raw contents of this SAM file + + +# now we will test the drop functionality by removing all duplicate bam/sam files from the database +# we know that only the first len(files) + len(files_qname) files are unique, so we remove +# everything with a file id higher than len(files) + len(files_qname) +to_query = [] # to_query will be filled with the files that will be used to run queries on +c = new_client() +for (file_id, dbschema) in inserted: + if file_id > len(files) + len(files_qname): + c.stdin.write("CALL bam_drop_file(%d, %d);\n"% (file_id, dbschema)) + else: + to_query.append((file_id, dbschema)) + +# and just to check, count the number of bam files +c.stdin.write("SELECT COUNT(*) FROM bam.files;\n") + +out, err = c.communicate() +sys.stdout.write(out) +sys.stderr.write(err) + + + + + +# Now we will execute all benchmark queries on all SAM/BAM files in the to_query list +# Furthermore, we will load every file entirely into the export table and then export +# it to both the SAM and the BAM file formats by using sam_export and bam_export +# respectively. + +# Note: some things are commented out, since the bam_export functionality is not available yet + +def tmp_output(file_id, sam=True): + return os.path.join(SRCDIR, "output_%d.%s"% (file_id, "sam" if sam else "bam")) + +def projection(file_id, dbschema): + return "SELECT qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual FROM bam.%salignments_%d"\ + % (('unpaired_all_' if dbschema == 1 else ''), file_id) + c = new_client() set_var(c, 'rname_1_3', 'chr22', numeric=False) set_var(c, 'pos_1_3_1', 1000000) @@ -58,48 +139,71 @@ set_var(c, 'rname_2_10', 'chr22', numeri set_var(c, 'pos_2_10', 80000000) set_var(c, 'distance_2_12', 10000000) -output_files = [] -for f in files_to_test: +# Determine the next file_id that will be assigned to a file that is loaded into the database +next_file_id = 1 +for (file_id, dbschema) in inserted: + next_file_id = max(next_file_id, file_id+1) + +for (file_id, dbschema) in to_query: # benchmark 1 for uc in range(1, 6): - for ln in open(os.path.join(SRCDIR, 'benchmarks_%s/query1.%d.sql'% (f[1], uc))): - c.stdin.write(ln.replace('alignments_i', 'alignments_%d'% f[0])) + for ln in open(os.path.join(SRCDIR, 'benchmarks_%s/query1.%d.sql'% (dbschema, uc))): + c.stdin.write(ln.replace('alignments_i', 'alignments_%d'% file_id)) - #benchmark 2 + # benchmark 2 for uc in range(1, 13): - for ln in open(os.path.join(SRCDIR, 'benchmarks_%s/query2.%d.sql'% (f[1], uc))): - c.stdin.write(ln.replace('alignments_i', 'alignments_%d'% f[0]) - .replace('alignments_j', 'alignments_%d'% f[0])) + for ln in open(os.path.join(SRCDIR, 'benchmarks_%s/query2.%d.sql'% (dbschema, uc))): + c.stdin.write(ln.replace('alignments_i', 'alignments_%d'% file_id) + .replace('alignments_j', 'alignments_%d'% file_id)) - #write all aux data - c.stdin.write("SELECT * FROM bam.alignments_extra_%d;"% f[0]); + # load this file into export table (export table is always cleared after exporting, so it should be empty here) + c.stdin.write("INSERT INTO bam.export (%s);\n"% projection(file_id, dbschema)) - #load into export table - c.stdin.write("INSERT INTO bam.export (SELECT qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, seq, qual FROM bam.%salignments_%d);"\ - % (('unpaired_all_' if f[1] == 1 else ''), f[0])) - output_files.append(os.path.join(SRCDIR, ("output_%d.sam"% f[0]))) - c.stdin.write("CALL sam_export('%s');"% output_files[-1]) + # write it to SAM file + c.stdin.write("CALL sam_export('%s');\n"% tmp_output(file_id)) + # insert it again into the export table + c.stdin.write("INSERT INTO bam.export (%s);\n"% projection(file_id, dbschema)) + # and write it to BAM file (will throw exception)) + c.stdin.write("CALL bam_export('%s');\n"% tmp_output(file_id, sam=False)) + # Now load SAM (and BAM) files back into the database + sam_id = next_file_id + # bamid = next_file_id + 1 + # next_file_id += 2 + next_file_id += 1 -out, err = c.communicate() + # Insert both the SAM and BAM files into the database. Note that we always insert into + # dbschema 0, since the outputted files aren't necessarily ordered by qname. + c.stdin.write("CALL bam_loader_file('%s', 0);\n"% tmp_output(file_id)) + c.stdin.write("CALL bam_loader_file('%s', 0);\n"% tmp_output(file_id, sam=False)) + + # We now have the alignment data for this file in three different alignment tables. The ultimate + # export/import test is now to see if the data in them is exactly the same (except for virtual_offset) + # by taking the except (should be empty of course) + c.stdin.write("%s\nEXCEPT\n%s;\n"% (projection(file_id, dbschema), projection(sam_id, 0))) + #c.stdin.write("%s EXCEPT %s";\n% (projection(file_id, dbschema), projection(bam_id, 0))) + # The output will contain explicit file_ids in the table names, making the table names # variable. We don't want a test to fail on this, so we have to remove te file # ids from the table names. -def replace(matchobj): - return 'alignments_%si'% (matchobj.group(1) if matchobj.group(1) else '') -p = re.compile('alignments_(extra_)?\d+') -out = (re.subn(p, replace, out))[0] +# Note: Has been commented out, since file ids should always be the same, since we always +# start out with an empty database + +#def replace(matchobj): +# return 'alignments_%si'% (matchobj.group(1) if matchobj.group(1) else '') +#p = re.compile('alignments_(extra_)?\d+') +#out = (re.subn(p, replace, out))[0] + +out, err = c.communicate() sys.stdout.write(out) sys.stderr.write(err) -# All that is left to do is write the contents of the exported SAM files and delete them -for f in output_files: - sys.stdout.write("\n\nContents of exported SAM file '%s':\n"% f) - for ln in open(f): - sys.stdout.write(ln) - os.remove(f) \ No newline at end of file +# All that is left to do is delete the temporary SAM/BAM files +for (file_id, dbschema) in to_query: + os.remove(tmp_output(file_id)) + os.remove(tmp_output(file_id, sam=False)) \ No newline at end of file diff --git a/sql/backends/monet5/bam/Tests/test.stable.err b/sql/backends/monet5/bam/Tests/test.stable.err --- a/sql/backends/monet5/bam/Tests/test.stable.err +++ b/sql/backends/monet5/bam/Tests/test.stable.err @@ -1,11 +1,11 @@ stderr of test 'test` in directory 'sql/backends/monet5/bam` itself: -# 20:58:07 > -# 20:58:07 > "mserver5" "--debug=10" "--set" "gdk_nr_threads=0" "--set" "mapi_open=true" "--set" "mapi_port=32253" "--set" "mapi_usock=/var/tmp/mtest-29485/.s.monetdb.32253" "--set" "monet_prompt=" "--forcemito" "--set" "mal_listing=2" "--dbpath=/export/scratch2/robin/Monet/PREFIX/DFT/var/MonetDB/mTests_sql_backends_monet5_bam" "--set" "mal_listing=0" -# 20:58:07 > +# 16:03:13 > +# 16:03:13 > "mserver5" "--debug=10" "--set" "gdk_nr_threads=0" "--set" "mapi_open=true" "--set" "mapi_port=31808" "--set" "mapi_usock=/var/tmp/mtest-26892/.s.monetdb.31808" "--set" "monet_prompt=" "--forcemito" "--set" "mal_listing=2" "--dbpath=/home/robin/MonetDB/PREFIX/var/MonetDB/mTests_sql_backends_monet5_bam" "--set" "mal_listing=0" +# 16:03:13 > -# builtin opt gdk_dbpath = /export/scratch2/robin/Monet/PREFIX/DFT/var/monetdb5/dbfarm/demo +# builtin opt gdk_dbpath = /home/robin/MonetDB/PREFIX/var/monetdb5/dbfarm/demo # builtin opt gdk_debug = 0 # builtin opt gdk_vmtrim = no # builtin opt monet_prompt = > @@ -17,11 +17,11 @@ stderr of test 'test` in directory 'sql/ # builtin opt sql_debug = 0 # cmdline opt gdk_nr_threads = 0 # cmdline opt mapi_open = true -# cmdline opt mapi_port = 32253 -# cmdline opt mapi_usock = /var/tmp/mtest-29485/.s.monetdb.32253 +# cmdline opt mapi_port = 31808 +# cmdline opt mapi_usock = /var/tmp/mtest-26892/.s.monetdb.31808 # cmdline opt monet_prompt = # cmdline opt mal_listing = 2 -# cmdline opt gdk_dbpath = /export/scratch2/robin/Monet/PREFIX/DFT/var/MonetDB/mTests_sql_backends_monet5_bam +# cmdline opt gdk_dbpath = /home/robin/MonetDB/PREFIX/var/MonetDB/mTests_sql_backends_monet5_bam # cmdline opt mal_listing = 0 # cmdline opt gdk_debug = 536870922 # <bam_loader>: Loader started for 2 BAM files... @@ -29,533 +29,1568 @@ stderr of test 'test` in directory 'sql/ # # CREATE SCHEMA bam; # # <bam_loader> Creating table 'files'... # # CREATE TABLE bam.files ( -# file_id BIGINT NOT NULL, -# file_location STRING NOT NULL, -# dbschema SMALLINT NOT NULL, -# format_version VARCHAR(7), -# sorting_order VARCHAR(10), -# comments STRING, -# CONSTRAINT files_pkey_file_id PRIMARY KEY (file_id) -# ); +# file_id BIGINT NOT NULL, +# file_location STRING NOT NULL, +# dbschema SMALLINT NOT NULL, +# format_version VARCHAR(7), +# sorting_order VARCHAR(10), _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list