Changeset: 872efa4d660f for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=872efa4d660f Added Files: sql/backends/monet5/bam/Tests/benchmark.sql Modified Files: sql/backends/monet5/bam/85_bam.sql sql/backends/monet5/bam/bam.mal sql/backends/monet5/bam/bam_lib.c sql/backends/monet5/bam/bam_lib.h sql/backends/monet5/bam/bam_loader.c sql/backends/monet5/bam/bam_schema_1.sql Branch: DVframework_bam Log Message:
Fixed a segmentation fault that occurred when calculating new file_id when database is empty. Extended and corrected bam_lib. Added benchmarks. SQL queries make use of the UDFs in bam_lib. Extensive explanation of the queries is also added. The queries will be tested soon. Some of them are known to crash mclient. H: segmentation fault that occurred when calculating new file_id when database is empty. Extended and corrected bam_lib. Enter commit message. Lines beginning with 'HG:' are removed. diffs (truncated from 473 to 300 lines): diff --git a/sql/backends/monet5/bam/85_bam.sql b/sql/backends/monet5/bam/85_bam.sql --- a/sql/backends/monet5/bam/85_bam.sql +++ b/sql/backends/monet5/bam/85_bam.sql @@ -2,11 +2,14 @@ CREATE PROCEDURE bam_loader(repo STRING, EXTERNAL NAME bam.bam_loader; -CREATE FUNCTION bam_flag(flag INT, name STRING) -RETURNS STRING EXTERNAL NAME bam.bam_flag; +CREATE FUNCTION bam_flag(flag SMALLINT, name STRING) +RETURNS BOOLEAN EXTERNAL NAME bam.bam_flag; CREATE FUNCTION reverse_seq(seq STRING) RETURNS STRING EXTERNAL NAME bam.reverse_seq; CREATE FUNCTION reverse_qual(qual STRING) RETURNS STRING EXTERNAL NAME bam.reverse_qual; + +CREATE FUNCTION seq_length(cigar STRING) +RETURNS INT EXTERNAL NAME bam.seq_length; \ No newline at end of file diff --git a/sql/backends/monet5/bam/Tests/benchmark.sql b/sql/backends/monet5/bam/Tests/benchmark.sql new file mode 100644 --- /dev/null +++ b/sql/backends/monet5/bam/Tests/benchmark.sql @@ -0,0 +1,306 @@ +-------------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------- Query 1 ---------------------------------------------------------------- +-------------------------------------------------------------------------------------------------------------------------------------- + +SELECT qname, + CASE WHEN bam_flag("flag", 'segm_reve') THEN reverse_seq("seq") ELSE "seq" END, + CASE WHEN bam_flag("flag", 'segm_reve') THEN reverse_qual("qual") ELSE "qual" END +FROM bam.alignments +WHERE file_id = 2 + AND bam_flag("flag", 'seco_alig') = FALSE + AND bam_flag("flag", 'segm_unma') = TRUE + AND qname IN ( + SELECT qname + FROM bam.alignments + WHERE bam_flag("flag", 'seco_alig') = FALSE + GROUP BY qname + HAVING COUNT(*) = 2 + AND SUM(bam_flag("flag", 'firs_segm')) = 1 + AND SUM(bam_flag("flag", 'last_segm')) = 1 + ) +ORDER BY qname, bam_flag("flag", 'last_segm'); + +-- Description: +-- This query selects fields required by the FASTQ file format (qname, seq/seq-reverse, qual/qual-reverse). +-- The result of this query has the following properties: +-- * It is calculated over a single BAM file +-- * It only considers primary alignments +-- * It only considers unmapped alignments +-- * The subquery selects only those qnames that have exactly two primary alignments: one left and one right alignment +-- * It is ordered by qname and then by the last_segm flag. The subquery imposes the invariant on the outer query that +-- only qnames are selected that have exactly one left primary read and one right primary read. The result of this +-- order is then that alignments of the same read are stored beneath eachother; the left segment first, and then the +-- right segment. + + + + + + + + + + +-------------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------- Query 2 ---------------------------------------------------------------- +-------------------------------------------------------------------------------------------------------------------------------------- + +SELECT a2.pos - (a1.pos + seq_length(a1.cigar)) AS distance, COUNT(*) AS nr_alignments +FROM ( + SELECT * + FROM bam.alignments + WHERE file_id = 1 + AND bam_flag(flag, 'seco_alig') = False + AND bam_flag(flag, 'firs_segm') = True +) AS a1 JOIN ( + SELECT * + FROM bam.alignments + WHERE file_id = 1 + AND bam_flag(flag, 'seco_alig') = False + AND bam_flag(flag, 'last_segm') = True +) AS a2 ON a1.qname = a2.qname +GROUP BY distance; + +-- Description: +-- This query calculates a histogram that displays the number of read pairs with a certain distance. +-- The outer query joins two subresults together. These subresults contain all primary alignments for +-- the first segment and the last segment respectively. If the query is consistent according to the +-- consistency check in query 4, there is a unique primary alignment for both sides (firs_segm and last_segm). +-- In the case there is a primary alignment in just one of the two subresults, the join will drop this +-- since the join predicate won't be fulfilled. +-- The distance of each record in this joined table can now be calculated. Furthermore, grouping can be +-- done on this distance to create the histogram. + + + + + + + + + +-------------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------- Query 3 ---------------------------------------------------------------- +-------------------------------------------------------------------------------------------------------------------------------------- + +SELECT * +FROM bam.alignments +WHERE qname IN ( + SELECT qname + FROM bam.alignments + WHERE file_id = 1 + GROUP BY qname + HAVING SUM(bam_flag(flag, 'firs_segm')) = 0 + OR SUM(bam_flag(flag, 'last_segm')) = 0 +) +ORDER BY qname; + +-- Description: +-- This query checks a consistency aspect of the BAM file with file_id=1. It returns all alignments +-- for qnames that have either no alignment flagged as first segment or no alignment flagged as last segment. +-- The inner query groups by qname and selects the inconsistent ones. The outer query then selects all attributes +-- from all alignments for these qnames. +-- To provide the user with a convenient overview of the inconsistencies, the result is ordered by qname. + + + + + + + + + +-------------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------- Query 4 ---------------------------------------------------------------- +-------------------------------------------------------------------------------------------------------------------------------------- + +SELECT * +FROM bam.alignments +WHERE qname IN ( + SELECT qname, + SUM(bam_flag(flag, 'firs_segm')) AS sum_firs_segm, + SUM(bam_flag(flag, 'last_segm')) AS sum_last_segm, + COUNT(*) - SUM(bam_flag(flag, 'seco_alig')) AS sum_prim_alig + FROM bam.alignments + WHERE file_id=1 + AND bam_flag(flag, 'segm_unma') = False + GROUP BY qname + HAVING (sum_firs_segm > 0 AND sum_last_segm > 0 AND sum_prim_alig <> 2) + OR (sum_firs_segm > 0 AND sum_prim_alig <> 1) + OR (sum_last_segm > 0 AND sum_prim_alig <> 1) +) +ORDER BY qname; + +-- Description: +-- This query also checks a consistency aspect of the BAM file with file_id=1. Every qname consists of two reads. +-- For both reads it must hold that all its alignments must either be unmapped or there must be exactly one primary +-- alignment. +-- In the inner query, the alignments with the unmapped flag set are thrown away directly. The alignments that +-- then remain are grouped by qname. A qname is then considered to be inconsistent if it fulfills one of these cases: +-- * There exists a left alignment and a right alignment in the group and the number of primary alignments <> 2 +-- * There exists a left alignment and no right alignment in the group and the number of primary alignments <> 1 +-- * There exists no left alignment and a right alignment in the group and the number of primary alignments <> 1 +-- Exactly these three cases can be seen in the HAVING clause of the inner query. +-- The query assumes that there won't exist alignments with flags firs_segm = last_segm = 0. +-- The outer query again selects all attributes of all alignments in the inconsistent qnames. +-- The result is again ordered by qname for convenient displaying. + + + + + + + + + + +-------------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------- Query 5 ---------------------------------------------------------------- +-------------------------------------------------------------------------------------------------------------------------------------- + +SELECT * +FROM alignments +WHERE qname IN ( + SELECT qname + FROM bam.alignments + WHERE file_id = 1 + INTERSECT + SELECT qname + FROM bam.alignments + WHERE file_id = 2 +); + +-- Description: +-- Does a set intersection on BAM files with file_id=1 and file_id=2, based on qname. + + + + + + + + + +-------------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------- Query 6 ---------------------------------------------------------------- +-------------------------------------------------------------------------------------------------------------------------------------- + +SELECT * +FROM alignments +WHERE qname IN ( + SELECT qname + FROM bam.alignments + WHERE file_id = 1 + EXCEPT + SELECT qname + FROM bam.alignments + WHERE file_id = 2 +); + +-- Description: +-- Does a set minus on BAM files with file_id=1 and file_id=2, based on qname. + + + + + + + + + + +-------------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------- Query 7 ---------------------------------------------------------------- +-------------------------------------------------------------------------------------------------------------------------------------- + +SELECT * +FROM ( + SELECT * + FROM + bam.alignments + WHERE file_id = 1 + AND bam_flag(flag, 'seco_alig') = FALSE +) AS a1 JOIN ( + SELECT * + FROM + bam.alignments + WHERE file_id = 2 + AND bam_flag(flag, 'seco_alig') = FALSE +) AS a2 + ON a1.qname = a2.qname + AND bam_flag(a1.flag, 'firs_segm') = bam_flag(a2.flag, 'firs_segm') + AND bam_flag(a1.flag, 'last_segm') = bam_flag(a2.flag, 'last_segm') + AND a1.pos <> a2.pos + +-- Description: +-- Joins primary alignments from two files (file_id=1 and file_id=2) together if these alignments have the same qname but +-- are mapped to different positions. +-- The first subquery selects all primary alignments from file with file_id=1 +-- The second subquery selects all primary alignments from file with file_id=2 +-- The outer query then joins two alignments from these two subresults together under the following conditions: +-- * The qnames of the alignments are equal +-- * The alignments are both either left or right alignments +-- * The positions of the alignments aren't equal + + + + + + + + + +-------------------------------------------------------------------------------------------------------------------------------------- +------------------------------------------------------------- Query 8 ---------------------------------------------------------------- +-------------------------------------------------------------------------------------------------------------------------------------- + +SELECT * +FROM alignments +WHERE file_id = 1 + AND rname = 'chr22' + AND 10 >= pos + AND 10 < pos + seq_length(cigar); + +-- Description: +-- Selects all alignments with file_id=1 that overlap position 10 in chromosome "chr22" + + + + + + + + + _______________________________________________ checkin-list mailing list checkin-list@monetdb.org http://mail.monetdb.org/mailman/listinfo/checkin-list