Changeset: 19aee3b937e8 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=19aee3b937e8
Modified Files:
        sql/backends/monet5/bam/bam_loader.c
        sql/backends/monet5/bam/bam_wrapper.c
        sql/backends/monet5/bam/bam_wrapper.h
Branch: bamloader
Log Message:

Finished SAM loader functionality. Turned out to be a lot of work, since 
Samtools wouldn't cooperate. Therefore, a SAM file is now entirely parsed by 
this BAM library, without any help of Samtools. Test update will follow soon.


diffs (truncated from 1146 to 300 lines):

diff --git a/sql/backends/monet5/bam/bam_loader.c 
b/sql/backends/monet5/bam/bam_loader.c
--- a/sql/backends/monet5/bam/bam_loader.c
+++ b/sql/backends/monet5/bam/bam_loader.c
@@ -283,7 +283,7 @@ bam_loader(Client cntxt, MalBlkPtr mb, s
 
        /* Parse all headers */
        for (i = 0; i < nr_files; ++i) {
-               TO_LOG("<bam_loader> Parsing BAM header for file '%s'...\n",
+               TO_LOG("<bam_loader> Parsing header for file '%s'...\n",
                       filenames[i]);
                if ((msg = process_header(bws + i)) != MAL_SUCCEED) {
                        goto cleanup;
@@ -310,7 +310,7 @@ bam_loader(Client cntxt, MalBlkPtr mb, s
 
        /* Create alignment storage */
        for (i = 0; i < nr_files; ++i) {
-               TO_LOG("<bam_loader> Creating alignment tables for BAM file 
'%s'...\n", filenames[i]);
+               TO_LOG("<bam_loader> Creating alignment tables for file 
'%s'...\n", filenames[i]);
                if ((dbschema == 0
                     && create_alignment_storage_0(cntxt,
                                                   "bam.create_storage_0",
diff --git a/sql/backends/monet5/bam/bam_wrapper.c 
b/sql/backends/monet5/bam/bam_wrapper.c
--- a/sql/backends/monet5/bam/bam_wrapper.c
+++ b/sql/backends/monet5/bam/bam_wrapper.c
@@ -106,22 +106,63 @@ init_bam_wrapper(bam_wrapper * bw, filet
                          ERR_INIT_BAM_WRAPPER "BAM file could not be opened",
                          file_location);
            }
-           if ((bw->header = bam_header_read(bw->bam.input)) == NULL) {
+           if ((bw->bam.header = bam_header_read(bw->bam.input)) == NULL) {
                throw(MAL, "init_bam_wrapper",
                    ERR_INIT_BAM_WRAPPER "Unable to read header from file",
                    file_location);
         }
     } else {
         /* Open SAM file and read its header */
-        if ((bw->sam.input = samopen(file_location, "r", NULL)) == NULL) {
+        int bufsize = 4096;
+        lng header_len = 0;
+        if ((bw->sam.input = open_rastream(file_location)) == NULL) {
                    throw(MAL, "init_bam_wrapper",
                          ERR_INIT_BAM_WRAPPER "SAM file could not be opened",
                          file_location);
            }
-           if ((bw->header = bw->sam.input->header) == NULL) {
-               throw(MAL, "init_bam_wrapper",
-                   ERR_INIT_BAM_WRAPPER "Unable to read header from file",
-                   file_location);
+        if ((bw->sam.header = (str)GDKmalloc(bufsize * sizeof(char))) == NULL) 
{
+            throw(MAL, "init_bam_wrapper", 
+                  ERR_INIT_BAM_WRAPPER MAL_MALLOC_FAIL, file_location);
+        }
+        while (TRUE) {
+            int read = mnstr_readline(bw->sam.input, bw->sam.header + 
header_len, bufsize - header_len);
+            
+            if (read <= 0) {
+                throw(MAL, "init_bam_wrapper", 
+                      ERR_INIT_BAM_WRAPPER "Could not read line of SAM 
header", 
+                      file_location);
+            }
+            
+            if (bw->sam.header[header_len] != '@') {
+                /* This is not a header line, we assume that the header is 
finished.
+                 * Rewind stream to start of line and stop reading */
+                if (mnstr_fsetpos(bw->sam.input, header_len) < 0) {
+                    throw(MAL, "init_bam_wrapper", 
+                      ERR_INIT_BAM_WRAPPER "Could not read last line of SAM 
header", 
+                      file_location);
+                }
+                bw->sam.header[header_len] = '\0'; /* Truncate alignment data 
*/
+                break;
+            }
+            
+            if (bw->sam.header[header_len+read-1] != '\n') {
+                /* This line was not completed. Increase buffer size, rewind 
stream
+                 * and try again */
+                bufsize *= 2;
+                if ((bw->sam.header = (str)GDKrealloc(bw->sam.header, bufsize 
* sizeof(char))) == NULL) {
+                    throw(MAL, "init_bam_wrapper", 
+                          ERR_INIT_BAM_WRAPPER MAL_MALLOC_FAIL, file_location);
+                }
+                if (mnstr_fsetpos(bw->sam.input, header_len) < 0) {
+                    throw(MAL, "init_bam_wrapper", 
+                      ERR_INIT_BAM_WRAPPER "Could not read last line of SAM 
header", 
+                      file_location);
+                }
+                continue;
+            }
+            
+            /* Only if no special cases occured, we added the read characters 
to the header */
+            header_len += read;
         }
     }
     
@@ -326,15 +367,18 @@ clear_bam_wrapper(bam_wrapper * bw)
 
        /* Clear bam/sam specific fields */
        if (bw->type == BAM) {
-           if (bw->header) {
-                   bam_header_destroy(bw->header);
+           if (bw->bam.header) {
+                   bam_header_destroy(bw->bam.header);
            }
            if (bw->bam.input) {
                    bam_close(bw->bam.input);
            }
     } else {
         if (bw->sam.input) {
-            samclose(bw->sam.input);
+            close_stream(bw->sam.input);
+        }
+        if (bw->sam.header) {
+            GDKfree(bw->sam.header);
         }
     }
 
@@ -630,7 +674,7 @@ clear_bam_header_line(bam_header_line * 
        if (strcmp((opt).tag, (cmp)) == 0) {                            \
                (l) = strtol((opt).value, &s, 10);                      \
                if ((s) == ((opt).value) || (l) == LONG_MIN || (l) == LONG_MAX) 
\
-                       l = -1;                                         \
+                       l = -1;                                 \
                if (!APPEND_LNG(file, l)) {                             \
                        throw(MAL, "process_header", ERR_PROCESS_HEADER "Could 
not write option '%s:%s' to binary file", \
                              bw->file_location, (opt).tag, (opt).value); \
@@ -657,7 +701,7 @@ clear_bam_header_line(bam_header_line * 
 str
 process_header(bam_wrapper * bw)
 {
-       str header_str = bw->header->text;
+       str header_str;
 
        bam_header_line hl;
        int i, o;
@@ -674,6 +718,12 @@ process_header(bam_wrapper * bw)
        bit eof = FALSE;
        str s;
        lng l;
+    
+    if (bw->type == BAM) {
+        header_str = bw->bam.header->text;
+    } else {
+        header_str = bw->sam.header;
+    }
 
        hd_comment.l = hd_comment.m = 0;
        hd_comment.s = NULL;
@@ -1066,7 +1116,7 @@ process_header(bam_wrapper * bw)
 
 typedef struct alignment {
        lng virtual_offset;
-       char qname[256];        /* Can't be bigger than 256, since this is the 
max that Samtools can handle */
+       str qname;
        sht flag;
        str rname;
        int pos;
@@ -1077,33 +1127,68 @@ typedef struct alignment {
        int tlen;
        str seq;
        str qual;
+    
+    str aux; /* Only used when loading SAM */
 
-       int cigar_size;;
-       int seq_size;
+    int qname_size; /* Current buffer size of qname strs (Only used when type 
= SAM or dbschema = 1) */
+    int rname_size; /* Current buffer size of rname/rnext strs (only used when 
loading SAM) */
+       int cigar_size; /* Current buffer size of cigar str */
+       int seq_size; /* Current buffer size of seq and qual str */
+    int aux_size; /* Current buffer size of aux (only used when loading SAM) */
 
        bit written;
 } alignment;
 
 static bit
-init_alignment(alignment * alig)
+init_alignment(bam_wrapper *bw, alignment * alig)
 {
+    bit result;
+    
        /* Enables clear function to check variables */
        memset(alig, 0, sizeof(alignment));
 
-       alig->cigar_size = 1024;
-       alig->seq_size = 128;
+    alig->qname_size = 10000;
+    alig->rname_size = 10000;
+       alig->cigar_size = 10000;
+       alig->seq_size = 10000;
+    alig->aux_size = 10000;
 
        /* Dynamic buffers to be able to expand when necessary */
        alig->cigar = (str) GDKmalloc(alig->cigar_size * sizeof(char));
        alig->seq = (str) GDKmalloc(alig->seq_size * sizeof(char));
        alig->qual = (str) GDKmalloc(alig->seq_size * sizeof(char));
+    
+    result = (alig->cigar != NULL && alig->seq != NULL
+               && alig->qual != NULL);
+    
+    if (bw->type == SAM || bw->dbschema == 1) {
+        /* In this case we need to allocate space for the qname string 
+         * type == SAM   => qname not stored in bam1_t struct
+         * dbschema == 1 => Alignments need to be stored for more than one
+         *                  iteration
+         * In other cases, it remains NULL and it will point to the qname
+         * in the bam1_t struct 
+         */
+        alig->qname = (str) GDKmalloc(alig->qname_size * sizeof(char));
+    }
+    if (bw->type == SAM) {
+        /* If we are loading SAM, we need buffers for rname and rnext,
+         * since this is not taken care of by a bam_header_t dict now */
+        alig->rname = (str) GDKmalloc(alig->rname_size * sizeof(char));
+        alig->rnext = (str) GDKmalloc(alig->rname_size * sizeof(char));
 
-       return (alig->cigar != NULL && alig->seq != NULL
-               && alig->qual != NULL);
+        /* For SAM, we also need to have a buffer for aux, since this is
+         * not stored in a bam1_t struct anymore */
+         alig->aux = (str) GDKmalloc(alig->aux_size * sizeof(char));
+         result = (result && alig->rname != NULL &&
+                alig->rnext != NULL && alig->aux != NULL);
+    }
+
+       return result;
 }
 
 static void
-clear_alignment(alignment * alig)
+clear_alignment(bam_wrapper *bw, alignment * alig)
 {
        if (alig->cigar)
                GDKfree(alig->cigar);
@@ -1111,46 +1196,165 @@ clear_alignment(alignment * alig)
                GDKfree(alig->seq);
        if (alig->qual)
                GDKfree(alig->qual);
+    if (bw->type == SAM || bw->dbschema == 1) {
+        if (alig->qname)
+            GDKfree(alig->qname);
+    }
+    if (bw->type == SAM) {
+        if(alig->rname)
+            GDKfree(alig->rname);
+        if(alig->rnext)
+            GDKfree(alig->rnext);
+        if(alig->aux)
+            GDKfree(alig->aux);
+    }
 }
 
 /**
  * Function checks whether or not the character buffers in the given
  * alignment are big enough to hold the given number of characters. If
- * not, it doubles the buffer sizes
+ * not, it doubles the buffer sizes.
+ * 
+ * Function is only used for loading BAM files
  */
-static bit
-check_alignment_buffers(alignment * alig, int cigar_size, int seq_size)
+static inline bit
+check_alignment_buffers(bam_wrapper *bw, alignment * alig, int qname_size, 
+        int cigar_size, int seq_size)
 {
-       bit resized[] = { FALSE, FALSE };
+       bit resized[] = { FALSE, FALSE, FALSE };
+    
+    assert (bw->type == BAM);
+    
+    if (bw->dbschema == 1) {
+        while (qname_size >= alig->qname_size) {
+            resized[0] = TRUE;
+            alig->qname_size *= 2;
+        }
+    }
        while (cigar_size >= alig->cigar_size) {
-               resized[0] = TRUE;
+               resized[1] = TRUE;
                alig->cigar_size *= 2;
        }
        while (seq_size >= alig->seq_size) {
-               resized[1] = TRUE;
+               resized[2] = TRUE;
                alig->seq_size *= 2;
        }
-       if (resized[0])
+    if (resized[0])
                alig->cigar =
                        GDKrealloc(alig->cigar,
                                   alig->cigar_size * sizeof(char));
-       if (resized[1]) {
+       if (resized[1])
+               alig->cigar =
+                       GDKrealloc(alig->cigar,
+                                  alig->cigar_size * sizeof(char));
+       if (resized[2]) {
                alig->seq =
                        GDKrealloc(alig->seq, alig->seq_size * sizeof(char));
                alig->qual =
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to