Changeset: 872efa4d660f for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=872efa4d660f
Added Files:
        sql/backends/monet5/bam/Tests/benchmark.sql
Modified Files:
        sql/backends/monet5/bam/85_bam.sql
        sql/backends/monet5/bam/bam.mal
        sql/backends/monet5/bam/bam_lib.c
        sql/backends/monet5/bam/bam_lib.h
        sql/backends/monet5/bam/bam_loader.c
        sql/backends/monet5/bam/bam_schema_1.sql
Branch: DVframework_bam
Log Message:

Fixed a segmentation fault that occurred when calculating new file_id when 
database is empty.
Extended and corrected bam_lib.
Added benchmarks. SQL queries make use of the UDFs in bam_lib. Extensive 
explanation of the queries is also added. The queries will be tested soon. Some 
of them are known to crash mclient.



H: segmentation fault that occurred when calculating new file_id when database 
is empty. Extended and corrected bam_lib.
 Enter commit message.  Lines beginning with 'HG:' are removed.


diffs (truncated from 473 to 300 lines):

diff --git a/sql/backends/monet5/bam/85_bam.sql 
b/sql/backends/monet5/bam/85_bam.sql
--- a/sql/backends/monet5/bam/85_bam.sql
+++ b/sql/backends/monet5/bam/85_bam.sql
@@ -2,11 +2,14 @@ CREATE PROCEDURE bam_loader(repo STRING,
 EXTERNAL NAME bam.bam_loader;
 
 
-CREATE FUNCTION bam_flag(flag INT, name STRING)
-RETURNS STRING EXTERNAL NAME bam.bam_flag;
+CREATE FUNCTION bam_flag(flag SMALLINT, name STRING)
+RETURNS BOOLEAN EXTERNAL NAME bam.bam_flag;
 
 CREATE FUNCTION reverse_seq(seq STRING)
 RETURNS STRING EXTERNAL NAME bam.reverse_seq;
 
 CREATE FUNCTION reverse_qual(qual STRING)
 RETURNS STRING EXTERNAL NAME bam.reverse_qual;
+
+CREATE FUNCTION seq_length(cigar STRING)
+RETURNS INT EXTERNAL NAME bam.seq_length;
\ No newline at end of file
diff --git a/sql/backends/monet5/bam/Tests/benchmark.sql 
b/sql/backends/monet5/bam/Tests/benchmark.sql
new file mode 100644
--- /dev/null
+++ b/sql/backends/monet5/bam/Tests/benchmark.sql
@@ -0,0 +1,306 @@
+--------------------------------------------------------------------------------------------------------------------------------------
+------------------------------------------------------------- Query 1 
----------------------------------------------------------------
+--------------------------------------------------------------------------------------------------------------------------------------
+
+SELECT qname, 
+    CASE WHEN bam_flag("flag", 'segm_reve') THEN reverse_seq("seq")   ELSE 
"seq"  END,
+    CASE WHEN bam_flag("flag", 'segm_reve') THEN reverse_qual("qual") ELSE 
"qual" END
+FROM bam.alignments
+WHERE   file_id = 2
+    AND bam_flag("flag", 'seco_alig') = FALSE
+    AND bam_flag("flag", 'segm_unma') = TRUE
+    AND qname IN (
+        SELECT qname
+        FROM bam.alignments
+        WHERE bam_flag("flag", 'seco_alig') = FALSE
+        GROUP BY qname
+        HAVING COUNT(*) = 2
+           AND SUM(bam_flag("flag", 'firs_segm')) = 1
+           AND SUM(bam_flag("flag", 'last_segm')) = 1
+    )
+ORDER BY qname, bam_flag("flag", 'last_segm');
+
+-- Description:
+-- This query selects fields required by the FASTQ file format (qname, 
seq/seq-reverse, qual/qual-reverse).
+-- The result of this query has the following properties:
+-- * It is calculated over a single BAM file
+-- * It only considers primary alignments
+-- * It only considers unmapped alignments
+-- * The subquery selects only those qnames that have exactly two primary 
alignments: one left and one right alignment
+-- * It is ordered by qname and then by the last_segm flag. The subquery 
imposes the invariant on the outer query that 
+--   only qnames are selected that have exactly one left primary read and one 
right primary read. The result of this
+--   order is then that alignments of the same read are stored beneath 
eachother; the left segment first, and then the
+--   right segment.
+
+
+
+
+
+
+
+
+
+
+--------------------------------------------------------------------------------------------------------------------------------------
+------------------------------------------------------------- Query 2 
----------------------------------------------------------------
+--------------------------------------------------------------------------------------------------------------------------------------
+
+SELECT a2.pos - (a1.pos + seq_length(a1.cigar)) AS distance, COUNT(*) AS 
nr_alignments
+FROM (
+    SELECT *
+    FROM bam.alignments
+    WHERE file_id = 1
+      AND bam_flag(flag, 'seco_alig') = False
+      AND bam_flag(flag, 'firs_segm') = True
+) AS a1 JOIN (
+    SELECT *
+    FROM bam.alignments
+    WHERE file_id = 1
+      AND bam_flag(flag, 'seco_alig') = False
+      AND bam_flag(flag, 'last_segm') = True
+) AS a2 ON a1.qname = a2.qname
+GROUP BY distance;
+
+-- Description:
+-- This query calculates a histogram that displays the number of read pairs 
with a certain distance.
+-- The outer query joins two subresults together. These subresults contain all 
primary alignments for
+-- the first segment and the last segment respectively. If the query is 
consistent according to the 
+-- consistency check in query 4, there is a unique primary alignment for both 
sides (firs_segm and last_segm).
+-- In the case there is a primary alignment in just one of the two subresults, 
the join will drop this
+-- since the join predicate won't be fulfilled. 
+-- The distance of each record in this joined table can now be calculated. 
Furthermore, grouping can be 
+-- done on this distance to create the histogram.
+
+
+
+
+
+
+
+
+
+--------------------------------------------------------------------------------------------------------------------------------------
+------------------------------------------------------------- Query 3 
----------------------------------------------------------------
+--------------------------------------------------------------------------------------------------------------------------------------
+
+SELECT *
+FROM bam.alignments 
+WHERE qname IN (
+    SELECT qname
+    FROM bam.alignments
+    WHERE file_id = 1
+    GROUP BY qname
+    HAVING SUM(bam_flag(flag, 'firs_segm')) = 0 
+        OR SUM(bam_flag(flag, 'last_segm')) = 0
+)
+ORDER BY qname;
+
+-- Description:
+-- This query checks a consistency aspect of the BAM file with file_id=1. It 
returns all alignments
+-- for qnames that have either no alignment flagged as first segment or no 
alignment flagged as last segment.
+-- The inner query groups by qname and selects the inconsistent ones. The 
outer query then selects all attributes
+-- from all alignments for these qnames.
+-- To provide the user with a convenient overview of the inconsistencies, the 
result is ordered by qname.
+
+
+
+
+
+
+
+
+
+--------------------------------------------------------------------------------------------------------------------------------------
+------------------------------------------------------------- Query 4 
----------------------------------------------------------------
+--------------------------------------------------------------------------------------------------------------------------------------
+
+SELECT *
+FROM bam.alignments
+WHERE qname IN (
+    SELECT qname, 
+        SUM(bam_flag(flag, 'firs_segm')) AS sum_firs_segm, 
+        SUM(bam_flag(flag, 'last_segm')) AS sum_last_segm, 
+        COUNT(*) - SUM(bam_flag(flag, 'seco_alig')) AS sum_prim_alig
+    FROM bam.alignments
+    WHERE file_id=1
+      AND bam_flag(flag, 'segm_unma') = False
+    GROUP BY qname
+    HAVING (sum_firs_segm > 0 AND sum_last_segm > 0 AND sum_prim_alig <> 2)
+        OR (sum_firs_segm > 0 AND sum_prim_alig <> 1)
+        OR (sum_last_segm > 0 AND sum_prim_alig <> 1)
+)
+ORDER BY qname;
+
+-- Description:
+-- This query also checks a consistency aspect of the BAM file with file_id=1. 
Every qname consists of two reads.
+-- For both reads it must hold that all its alignments must either be unmapped 
or there must be exactly one primary
+-- alignment.
+-- In the inner query, the alignments with the unmapped flag set are thrown 
away directly. The alignments that 
+-- then remain are grouped by qname. A qname is then considered to be 
inconsistent if it fulfills one of these cases:
+-- * There exists a left alignment and a right alignment in the group and the 
number of primary alignments <> 2
+-- * There exists a left alignment and no right alignment in the group and the 
number of primary alignments <> 1
+-- * There exists no left alignment and a right alignment in the group and the 
number of primary alignments <> 1
+-- Exactly these three cases can be seen in the HAVING clause of the inner 
query.
+-- The query assumes that there won't exist alignments with flags firs_segm = 
last_segm = 0.
+-- The outer query again selects all attributes of all alignments in the 
inconsistent qnames. 
+-- The result is again ordered by qname for convenient displaying.
+
+
+
+
+
+
+
+
+
+
+--------------------------------------------------------------------------------------------------------------------------------------
+------------------------------------------------------------- Query 5 
----------------------------------------------------------------
+--------------------------------------------------------------------------------------------------------------------------------------
+
+SELECT *
+FROM alignments
+WHERE qname IN (
+    SELECT qname
+    FROM bam.alignments
+    WHERE file_id = 1
+    INTERSECT
+    SELECT qname 
+    FROM bam.alignments
+    WHERE file_id = 2
+);
+
+-- Description:
+-- Does a set intersection on BAM files with file_id=1 and file_id=2, based on 
qname.
+
+
+
+
+
+
+
+
+
+--------------------------------------------------------------------------------------------------------------------------------------
+------------------------------------------------------------- Query 6 
----------------------------------------------------------------
+--------------------------------------------------------------------------------------------------------------------------------------
+
+SELECT *
+FROM alignments
+WHERE qname IN (
+    SELECT qname
+    FROM bam.alignments
+    WHERE file_id = 1
+    EXCEPT
+    SELECT qname 
+    FROM bam.alignments
+    WHERE file_id = 2
+);
+
+-- Description:
+-- Does a set minus on BAM files with file_id=1 and file_id=2, based on qname.
+
+
+
+
+
+
+
+
+
+
+--------------------------------------------------------------------------------------------------------------------------------------
+------------------------------------------------------------- Query 7 
----------------------------------------------------------------
+--------------------------------------------------------------------------------------------------------------------------------------
+
+SELECT *
+FROM (
+    SELECT *
+    FROM
+    bam.alignments
+    WHERE file_id = 1
+      AND bam_flag(flag, 'seco_alig') = FALSE
+) AS a1 JOIN (
+    SELECT *
+    FROM
+    bam.alignments
+    WHERE file_id = 2
+      AND bam_flag(flag, 'seco_alig') = FALSE
+) AS a2 
+    ON  a1.qname = a2.qname
+    AND bam_flag(a1.flag, 'firs_segm') = bam_flag(a2.flag, 'firs_segm')
+    AND bam_flag(a1.flag, 'last_segm') = bam_flag(a2.flag, 'last_segm')
+    AND a1.pos <> a2.pos
+    
+-- Description:
+-- Joins primary alignments from two files (file_id=1 and file_id=2) together 
if these alignments have the same qname but
+-- are mapped to different positions. 
+-- The first subquery selects all primary alignments from file with file_id=1
+-- The second subquery selects all primary alignments from file with file_id=2
+-- The outer query then joins two alignments from these two subresults 
together under the following conditions:
+-- * The qnames of the alignments are equal
+-- * The alignments are both either left or right alignments
+-- * The positions of the alignments aren't equal
+
+
+
+
+
+
+
+
+
+--------------------------------------------------------------------------------------------------------------------------------------
+------------------------------------------------------------- Query 8 
----------------------------------------------------------------
+--------------------------------------------------------------------------------------------------------------------------------------
+
+SELECT *
+FROM alignments
+WHERE   file_id = 1
+    AND rname = 'chr22'
+    AND 10 >= pos
+    AND 10 < pos + seq_length(cigar);
+
+-- Description:
+-- Selects all alignments with file_id=1 that overlap position 10 in 
chromosome "chr22"
+
+
+
+
+
+
+
+
+
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to