MonetDB: bamloader - Improved test greatly

Robin Cijvat Thu, 03 Jul 2014 07:24:12 -0700

Changeset: e1dd2e16216e for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=e1dd2e16216e
Removed Files:
        sql/backends/monet5/bam/Tests/load.sql
Modified Files:
        sql/backends/monet5/bam/Tests/test.SQL.py
        sql/backends/monet5/bam/Tests/test.stable.err
        sql/backends/monet5/bam/Tests/test.stable.out
Branch: bamloader
Log Message:


Improved test greatly


diffs (truncated from 4768 to 300 lines):

diff --git a/sql/backends/monet5/bam/Tests/load.sql 
b/sql/backends/monet5/bam/Tests/load.sql
deleted file mode 100644
--- a/sql/backends/monet5/bam/Tests/load.sql
+++ /dev/null
@@ -1,2 +0,0 @@
-CALL bam_loader_repos('PWD/files', 0, 32);
-CALL bam_loader_repos('PWD/files/queryname', 1, 32);
diff --git a/sql/backends/monet5/bam/Tests/test.SQL.py 
b/sql/backends/monet5/bam/Tests/test.SQL.py
--- a/sql/backends/monet5/bam/Tests/test.SQL.py
+++ b/sql/backends/monet5/bam/Tests/test.SQL.py
@@ -13,38 +13,119 @@ def new_client():
 
 def set_var(c, name, value, numeric=True):
     if numeric:
-        c.stdin.write('DECLARE %s integer;'% name)
-        c.stdin.write('SET %s=%s;'%(name, value))
+        c.stdin.write('DECLARE %s integer;\n'% name)
+        c.stdin.write('SET %s=%s;\n'%(name, value))
     else:
-        c.stdin.write('DECLARE %s varchar(32);'% name)
-        c.stdin.write('SET %s=\'%s\';'% (name, value))
+        c.stdin.write('DECLARE %s varchar(32);\n'% name)
+        c.stdin.write('SET %s=\'%s\';\n'% (name, value))
 
-# start with loading bam files into the database
+
+
+
+
+# start with loading bam files into the database with all loader functions we 
have built
+# we assume that all bam or sam files are stored into the files directory.
+# these files will all be loaded into the database with all loading methods 
that we have.
+# files in the directory files/queryname are assumed to be ordered by 
queryname and are
+# therefore also loaded into the pairwise storage schema
+
 c = new_client()
-for ln in open(os.path.join(SRCDIR,"load.sql")):
-    c.stdin.write(ln.replace('PWD', SRCDIR))
+
+# get files
+files_path = os.path.join(SRCDIR, "files")
+files_qname_path = os.path.join(files_path, "queryname")
+files = [ f for f in os.listdir(files_path) if 
os.path.isfile(os.path.join(files_path, f)) ]
+files_qname = [ f for f in os.listdir(files_qname_path) if 
os.path.isfile(os.path.join(files_qname_path, f)) ]
+
+# use bam_loader_repos
+c.stdin.write("CALL bam_loader_repos('%s', 0, 8);\n"% files_path)
+c.stdin.write("CALL bam_loader_repos('%s', 1, 8);\n"% files_qname_path)
+
+# use bam_loader_files by first writing temporary files and then invoking 
these loaders
+f_tmp_path = os.path.join(SRCDIR, "tmp_list.txt")
+f_tmp_qname_path = os.path.join(SRCDIR, "tmp_list_qname.txt")
+f_tmp = open(f_tmp_path, "w")
+f_tmp_qname = open(f_tmp_qname_path, "w")
+
+for file in files:
+    f_tmp.write("%s\n"% os.path.join(files_path, file))
+
+for file in files_qname:
+    f_tmp_qname.write("%s\n"% os.path.join(files_qname_path, file))
+
+f_tmp.close()
+f_tmp_qname.close()
+
+c.stdin.write("CALL bam_loader_files('%s', 0, 8);\n"% f_tmp_path)
+c.stdin.write("CALL bam_loader_files('%s', 1, 8);\n"% f_tmp_qname_path)
+
+# use bam_loader_file on all separate files
+for file in files:
+    c.stdin.write("CALL bam_loader_file('%s', 0);\n"% os.path.join(files_path, 
file))
+
+for file in files_qname:
+    c.stdin.write("CALL bam_loader_file('%s', 1);\n"% 
os.path.join(files_qname_path, file))
 
 out, err = c.communicate()
 sys.stdout.write(out)
 sys.stderr.write(err)
 
+# clean up the temporary files
+os.remove(f_tmp_path)
+os.remove(f_tmp_qname_path)
 
-# now retrieve the file ids that have been inserted
-# (should be 1,2,..,nr_files, but still we extract it from the
-# db to be flexible to logical changes in the bam loader as
-# much as possible)
+
+# extract the files that have been inserted into the database
+# gives us a list of (file_id, dbschema) tuples
 c = new_client()
-c.stdin.write("SELECT file_id, dbschema FROM bam.files;")
+c.stdin.write("SELECT file_id, dbschema FROM bam.files;\n")
 out, err = c.communicate()
-p = re.compile('^\s*\[\s*(\d)\s*,\s*(\d)\s*\]\s*$', re.MULTILINE)
-files_to_test = []
+sys.stdout.write(out)
+sys.stderr.write(err)
+
+p = re.compile('^\s*\[\s*(\d+)\s*,\s*(\d)\s*\]\s*$', re.MULTILINE) # Parses 
raw DB output
+inserted = []
 for match in p.finditer(out):
-    files_to_test.append((int(match.group(1)), int(match.group(2))))
+    inserted.append((int(match.group(1)), int(match.group(2))))
 
-# Now we will execute all benchmark queries on all BAM files in the bam.files 
table
-# and output all data contained in the aux table.
-# Furthermore, we transfer every file to the export table, use sam_export to 
write
-# the contents to a SAM file and then print the raw contents of this SAM file
+
+
+# now we will test the drop functionality by removing all duplicate bam/sam 
files from the database
+# we know that only the first len(files) + len(files_qname) files are unique, 
so we remove
+# everything with a file id higher than len(files) + len(files_qname)
+to_query = [] # to_query will be filled with the files that will be used to 
run queries on
+c = new_client()
+for (file_id, dbschema) in inserted:
+    if file_id > len(files) + len(files_qname):
+        c.stdin.write("CALL bam_drop_file(%d, %d);\n"% (file_id, dbschema))
+    else:
+        to_query.append((file_id, dbschema))
+
+# and just to check, count the number of bam files
+c.stdin.write("SELECT COUNT(*) FROM bam.files;\n")
+
+out, err = c.communicate()
+sys.stdout.write(out)
+sys.stderr.write(err)
+
+
+
+
+
+# Now we will execute all benchmark queries on all SAM/BAM files in the 
to_query list
+# Furthermore, we will load every file entirely into the export table and then 
export
+# it to both the SAM and the BAM file formats by using sam_export and 
bam_export
+# respectively.
+
+# Note: some things are commented out, since the bam_export functionality is 
not available yet
+
+def tmp_output(file_id, sam=True):
+    return os.path.join(SRCDIR, "output_%d.%s"% (file_id, "sam" if sam else 
"bam"))
+
+def projection(file_id, dbschema):
+    return "SELECT qname, flag, rname, pos, mapq, cigar, rnext, pnext, tlen, 
seq, qual FROM bam.%salignments_%d"\
+        % (('unpaired_all_' if dbschema == 1 else ''), file_id)
+
 c = new_client()
 set_var(c, 'rname_1_3', 'chr22', numeric=False)
 set_var(c, 'pos_1_3_1', 1000000)
@@ -58,48 +139,71 @@ set_var(c, 'rname_2_10', 'chr22', numeri
 set_var(c, 'pos_2_10', 80000000)
 set_var(c, 'distance_2_12', 10000000)
 
-output_files = []
-for f in files_to_test:
+# Determine the next file_id that will be assigned to a file that is loaded 
into the database
+next_file_id = 1
+for (file_id, dbschema) in inserted:
+    next_file_id = max(next_file_id, file_id+1)
+
+for (file_id, dbschema) in to_query:
     # benchmark 1
     for uc in range(1, 6):
-        for ln in open(os.path.join(SRCDIR, 'benchmarks_%s/query1.%d.sql'% 
(f[1], uc))):
-            c.stdin.write(ln.replace('alignments_i', 'alignments_%d'% f[0]))
+        for ln in open(os.path.join(SRCDIR, 'benchmarks_%s/query1.%d.sql'% 
(dbschema, uc))):
+            c.stdin.write(ln.replace('alignments_i', 'alignments_%d'% file_id))
 
-    #benchmark 2
+    # benchmark 2
     for uc in range(1, 13):
-        for ln in open(os.path.join(SRCDIR, 'benchmarks_%s/query2.%d.sql'% 
(f[1], uc))):
-            c.stdin.write(ln.replace('alignments_i', 'alignments_%d'% f[0])
-                            .replace('alignments_j', 'alignments_%d'% f[0]))
+        for ln in open(os.path.join(SRCDIR, 'benchmarks_%s/query2.%d.sql'% 
(dbschema, uc))):
+            c.stdin.write(ln.replace('alignments_i', 'alignments_%d'% file_id)
+                            .replace('alignments_j', 'alignments_%d'% file_id))
 
-    #write all aux data
-    c.stdin.write("SELECT * FROM bam.alignments_extra_%d;"% f[0]);
+    # load this file into export table (export table is always cleared after 
exporting, so it should be empty here)
+    c.stdin.write("INSERT INTO bam.export (%s);\n"% projection(file_id, 
dbschema))
 
-    #load into export table
-    c.stdin.write("INSERT INTO bam.export (SELECT qname, flag, rname, pos, 
mapq, cigar, rnext, pnext, tlen, seq, qual FROM bam.%salignments_%d);"\
-        % (('unpaired_all_' if f[1] == 1 else ''), f[0]))
-    output_files.append(os.path.join(SRCDIR, ("output_%d.sam"% f[0])))
-    c.stdin.write("CALL sam_export('%s');"% output_files[-1])
+    # write it to SAM file
+    c.stdin.write("CALL sam_export('%s');\n"% tmp_output(file_id))
 
+    # insert it again into the export table
+    c.stdin.write("INSERT INTO bam.export (%s);\n"% projection(file_id, 
dbschema))
 
+    # and write it to BAM file (will throw exception))
+    c.stdin.write("CALL bam_export('%s');\n"% tmp_output(file_id, sam=False))
 
+    # Now load SAM (and BAM) files back into the database
+    sam_id = next_file_id
+    # bamid = next_file_id + 1
+    # next_file_id += 2
+    next_file_id += 1
 
-out, err = c.communicate()
+    # Insert both the SAM and BAM files into the database. Note that we always 
insert into
+    # dbschema 0, since the outputted files aren't necessarily ordered by 
qname.
+    c.stdin.write("CALL bam_loader_file('%s', 0);\n"% tmp_output(file_id))
+    c.stdin.write("CALL bam_loader_file('%s', 0);\n"% tmp_output(file_id, 
sam=False))
+
+    # We now have the alignment data for this file in three different 
alignment tables. The ultimate
+    # export/import test is now to see if the data in them is exactly the same 
(except for virtual_offset)
+    # by taking the except (should be empty of course)
+    c.stdin.write("%s\nEXCEPT\n%s;\n"% (projection(file_id, dbschema), 
projection(sam_id, 0)))
+    #c.stdin.write("%s EXCEPT %s";\n% (projection(file_id, dbschema), 
projection(bam_id, 0)))
+
 
 # The output will contain explicit file_ids in the table names, making the 
table names
 # variable. We don't want a test to fail on this, so we have to remove te file
 # ids from the table names.
-def replace(matchobj):
-    return 'alignments_%si'% (matchobj.group(1) if matchobj.group(1) else '')
-p = re.compile('alignments_(extra_)?\d+')
-out = (re.subn(p, replace, out))[0]
 
+# Note: Has been commented out, since file ids should always be the same, 
since we always
+# start out with an empty database
+
+#def replace(matchobj):
+#    return 'alignments_%si'% (matchobj.group(1) if matchobj.group(1) else '')
+#p = re.compile('alignments_(extra_)?\d+')
+#out = (re.subn(p, replace, out))[0]
+
+out, err = c.communicate()
 sys.stdout.write(out)
 sys.stderr.write(err)
 
 
-# All that is left to do is write the contents of the exported SAM files and 
delete them
-for f in output_files:
-    sys.stdout.write("\n\nContents of exported SAM file '%s':\n"% f)
-    for ln in open(f):
-        sys.stdout.write(ln)
-    os.remove(f)
\ No newline at end of file
+# All that is left to do is delete the temporary SAM/BAM files
+for (file_id, dbschema) in to_query:
+    os.remove(tmp_output(file_id))
+    os.remove(tmp_output(file_id, sam=False))
\ No newline at end of file
diff --git a/sql/backends/monet5/bam/Tests/test.stable.err 
b/sql/backends/monet5/bam/Tests/test.stable.err
--- a/sql/backends/monet5/bam/Tests/test.stable.err
+++ b/sql/backends/monet5/bam/Tests/test.stable.err
@@ -1,11 +1,11 @@
 stderr of test 'test` in directory 'sql/backends/monet5/bam` itself:
 
 
-# 20:58:07 >  
-# 20:58:07 >  "mserver5" "--debug=10" "--set" "gdk_nr_threads=0" "--set" 
"mapi_open=true" "--set" "mapi_port=32253" "--set" 
"mapi_usock=/var/tmp/mtest-29485/.s.monetdb.32253" "--set" "monet_prompt=" 
"--forcemito" "--set" "mal_listing=2" 
"--dbpath=/export/scratch2/robin/Monet/PREFIX/DFT/var/MonetDB/mTests_sql_backends_monet5_bam"
 "--set" "mal_listing=0"
-# 20:58:07 >  
+# 16:03:13 >  
+# 16:03:13 >  "mserver5" "--debug=10" "--set" "gdk_nr_threads=0" "--set" 
"mapi_open=true" "--set" "mapi_port=31808" "--set" 
"mapi_usock=/var/tmp/mtest-26892/.s.monetdb.31808" "--set" "monet_prompt=" 
"--forcemito" "--set" "mal_listing=2" 
"--dbpath=/home/robin/MonetDB/PREFIX/var/MonetDB/mTests_sql_backends_monet5_bam"
 "--set" "mal_listing=0"
+# 16:03:13 >  
 
-# builtin opt  gdk_dbpath = 
/export/scratch2/robin/Monet/PREFIX/DFT/var/monetdb5/dbfarm/demo
+# builtin opt  gdk_dbpath = /home/robin/MonetDB/PREFIX/var/monetdb5/dbfarm/demo
 # builtin opt  gdk_debug = 0
 # builtin opt  gdk_vmtrim = no
 # builtin opt  monet_prompt = >
@@ -17,11 +17,11 @@ stderr of test 'test` in directory 'sql/
 # builtin opt  sql_debug = 0
 # cmdline opt  gdk_nr_threads = 0
 # cmdline opt  mapi_open = true
-# cmdline opt  mapi_port = 32253
-# cmdline opt  mapi_usock = /var/tmp/mtest-29485/.s.monetdb.32253
+# cmdline opt  mapi_port = 31808
+# cmdline opt  mapi_usock = /var/tmp/mtest-26892/.s.monetdb.31808
 # cmdline opt  monet_prompt = 
 # cmdline opt  mal_listing = 2
-# cmdline opt  gdk_dbpath = 
/export/scratch2/robin/Monet/PREFIX/DFT/var/MonetDB/mTests_sql_backends_monet5_bam
+# cmdline opt  gdk_dbpath = 
/home/robin/MonetDB/PREFIX/var/MonetDB/mTests_sql_backends_monet5_bam
 # cmdline opt  mal_listing = 0
 # cmdline opt  gdk_debug = 536870922
 # <bam_loader>: Loader started for 2 BAM files...
@@ -29,533 +29,1568 @@ stderr of test 'test` in directory 'sql/
 # # CREATE SCHEMA bam;
 # # <bam_loader> Creating table 'files'...
 # # CREATE TABLE bam.files ( 
-#         file_id                        BIGINT      NOT NULL, 
-#         file_location                  STRING      NOT NULL, 
-#         dbschema                       SMALLINT    NOT NULL, 
-#         format_version                 VARCHAR(7), 
-#         sorting_order                  VARCHAR(10), 
-#         comments                       STRING, 
-#         CONSTRAINT files_pkey_file_id PRIMARY KEY (file_id) 
-#     );
+#              file_id                                         BIGINT    NOT 
NULL, 
+#              file_location                             STRING          NOT 
NULL, 
+#              dbschema                                           SMALLINT     
NOT NULL, 
+#              format_version                           VARCHAR(7), 
+#              sorting_order                             VARCHAR(10), 
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

MonetDB: bamloader - Improved test greatly

Reply via email to