Changeset: 19aee3b937e8 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=19aee3b937e8 Modified Files: sql/backends/monet5/bam/bam_loader.c sql/backends/monet5/bam/bam_wrapper.c sql/backends/monet5/bam/bam_wrapper.h Branch: bamloader Log Message:
Finished SAM loader functionality. Turned out to be a lot of work, since Samtools wouldn't cooperate. Therefore, a SAM file is now entirely parsed by this BAM library, without any help of Samtools. Test update will follow soon. diffs (truncated from 1146 to 300 lines): diff --git a/sql/backends/monet5/bam/bam_loader.c b/sql/backends/monet5/bam/bam_loader.c --- a/sql/backends/monet5/bam/bam_loader.c +++ b/sql/backends/monet5/bam/bam_loader.c @@ -283,7 +283,7 @@ bam_loader(Client cntxt, MalBlkPtr mb, s /* Parse all headers */ for (i = 0; i < nr_files; ++i) { - TO_LOG("<bam_loader> Parsing BAM header for file '%s'...\n", + TO_LOG("<bam_loader> Parsing header for file '%s'...\n", filenames[i]); if ((msg = process_header(bws + i)) != MAL_SUCCEED) { goto cleanup; @@ -310,7 +310,7 @@ bam_loader(Client cntxt, MalBlkPtr mb, s /* Create alignment storage */ for (i = 0; i < nr_files; ++i) { - TO_LOG("<bam_loader> Creating alignment tables for BAM file '%s'...\n", filenames[i]); + TO_LOG("<bam_loader> Creating alignment tables for file '%s'...\n", filenames[i]); if ((dbschema == 0 && create_alignment_storage_0(cntxt, "bam.create_storage_0", diff --git a/sql/backends/monet5/bam/bam_wrapper.c b/sql/backends/monet5/bam/bam_wrapper.c --- a/sql/backends/monet5/bam/bam_wrapper.c +++ b/sql/backends/monet5/bam/bam_wrapper.c @@ -106,22 +106,63 @@ init_bam_wrapper(bam_wrapper * bw, filet ERR_INIT_BAM_WRAPPER "BAM file could not be opened", file_location); } - if ((bw->header = bam_header_read(bw->bam.input)) == NULL) { + if ((bw->bam.header = bam_header_read(bw->bam.input)) == NULL) { throw(MAL, "init_bam_wrapper", ERR_INIT_BAM_WRAPPER "Unable to read header from file", file_location); } } else { /* Open SAM file and read its header */ - if ((bw->sam.input = samopen(file_location, "r", NULL)) == NULL) { + int bufsize = 4096; + lng header_len = 0; + if ((bw->sam.input = open_rastream(file_location)) == NULL) { throw(MAL, "init_bam_wrapper", ERR_INIT_BAM_WRAPPER "SAM file could not be opened", file_location); } - if ((bw->header = bw->sam.input->header) == NULL) { - throw(MAL, "init_bam_wrapper", - ERR_INIT_BAM_WRAPPER "Unable to read header from file", - file_location); + if ((bw->sam.header = (str)GDKmalloc(bufsize * sizeof(char))) == NULL) { + throw(MAL, "init_bam_wrapper", + ERR_INIT_BAM_WRAPPER MAL_MALLOC_FAIL, file_location); + } + while (TRUE) { + int read = mnstr_readline(bw->sam.input, bw->sam.header + header_len, bufsize - header_len); + + if (read <= 0) { + throw(MAL, "init_bam_wrapper", + ERR_INIT_BAM_WRAPPER "Could not read line of SAM header", + file_location); + } + + if (bw->sam.header[header_len] != '@') { + /* This is not a header line, we assume that the header is finished. + * Rewind stream to start of line and stop reading */ + if (mnstr_fsetpos(bw->sam.input, header_len) < 0) { + throw(MAL, "init_bam_wrapper", + ERR_INIT_BAM_WRAPPER "Could not read last line of SAM header", + file_location); + } + bw->sam.header[header_len] = '\0'; /* Truncate alignment data */ + break; + } + + if (bw->sam.header[header_len+read-1] != '\n') { + /* This line was not completed. Increase buffer size, rewind stream + * and try again */ + bufsize *= 2; + if ((bw->sam.header = (str)GDKrealloc(bw->sam.header, bufsize * sizeof(char))) == NULL) { + throw(MAL, "init_bam_wrapper", + ERR_INIT_BAM_WRAPPER MAL_MALLOC_FAIL, file_location); + } + if (mnstr_fsetpos(bw->sam.input, header_len) < 0) { + throw(MAL, "init_bam_wrapper", + ERR_INIT_BAM_WRAPPER "Could not read last line of SAM header", + file_location); + } + continue; + } + + /* Only if no special cases occured, we added the read characters to the header */ + header_len += read; } } @@ -326,15 +367,18 @@ clear_bam_wrapper(bam_wrapper * bw) /* Clear bam/sam specific fields */ if (bw->type == BAM) { - if (bw->header) { - bam_header_destroy(bw->header); + if (bw->bam.header) { + bam_header_destroy(bw->bam.header); } if (bw->bam.input) { bam_close(bw->bam.input); } } else { if (bw->sam.input) { - samclose(bw->sam.input); + close_stream(bw->sam.input); + } + if (bw->sam.header) { + GDKfree(bw->sam.header); } } @@ -630,7 +674,7 @@ clear_bam_header_line(bam_header_line * if (strcmp((opt).tag, (cmp)) == 0) { \ (l) = strtol((opt).value, &s, 10); \ if ((s) == ((opt).value) || (l) == LONG_MIN || (l) == LONG_MAX) \ - l = -1; \ + l = -1; \ if (!APPEND_LNG(file, l)) { \ throw(MAL, "process_header", ERR_PROCESS_HEADER "Could not write option '%s:%s' to binary file", \ bw->file_location, (opt).tag, (opt).value); \ @@ -657,7 +701,7 @@ clear_bam_header_line(bam_header_line * str process_header(bam_wrapper * bw) { - str header_str = bw->header->text; + str header_str; bam_header_line hl; int i, o; @@ -674,6 +718,12 @@ process_header(bam_wrapper * bw) bit eof = FALSE; str s; lng l; + + if (bw->type == BAM) { + header_str = bw->bam.header->text; + } else { + header_str = bw->sam.header; + } hd_comment.l = hd_comment.m = 0; hd_comment.s = NULL; @@ -1066,7 +1116,7 @@ process_header(bam_wrapper * bw) typedef struct alignment { lng virtual_offset; - char qname[256]; /* Can't be bigger than 256, since this is the max that Samtools can handle */ + str qname; sht flag; str rname; int pos; @@ -1077,33 +1127,68 @@ typedef struct alignment { int tlen; str seq; str qual; + + str aux; /* Only used when loading SAM */ - int cigar_size;; - int seq_size; + int qname_size; /* Current buffer size of qname strs (Only used when type = SAM or dbschema = 1) */ + int rname_size; /* Current buffer size of rname/rnext strs (only used when loading SAM) */ + int cigar_size; /* Current buffer size of cigar str */ + int seq_size; /* Current buffer size of seq and qual str */ + int aux_size; /* Current buffer size of aux (only used when loading SAM) */ bit written; } alignment; static bit -init_alignment(alignment * alig) +init_alignment(bam_wrapper *bw, alignment * alig) { + bit result; + /* Enables clear function to check variables */ memset(alig, 0, sizeof(alignment)); - alig->cigar_size = 1024; - alig->seq_size = 128; + alig->qname_size = 10000; + alig->rname_size = 10000; + alig->cigar_size = 10000; + alig->seq_size = 10000; + alig->aux_size = 10000; /* Dynamic buffers to be able to expand when necessary */ alig->cigar = (str) GDKmalloc(alig->cigar_size * sizeof(char)); alig->seq = (str) GDKmalloc(alig->seq_size * sizeof(char)); alig->qual = (str) GDKmalloc(alig->seq_size * sizeof(char)); + + result = (alig->cigar != NULL && alig->seq != NULL + && alig->qual != NULL); + + if (bw->type == SAM || bw->dbschema == 1) { + /* In this case we need to allocate space for the qname string + * type == SAM => qname not stored in bam1_t struct + * dbschema == 1 => Alignments need to be stored for more than one + * iteration + * In other cases, it remains NULL and it will point to the qname + * in the bam1_t struct + */ + alig->qname = (str) GDKmalloc(alig->qname_size * sizeof(char)); + } + if (bw->type == SAM) { + /* If we are loading SAM, we need buffers for rname and rnext, + * since this is not taken care of by a bam_header_t dict now */ + alig->rname = (str) GDKmalloc(alig->rname_size * sizeof(char)); + alig->rnext = (str) GDKmalloc(alig->rname_size * sizeof(char)); - return (alig->cigar != NULL && alig->seq != NULL - && alig->qual != NULL); + /* For SAM, we also need to have a buffer for aux, since this is + * not stored in a bam1_t struct anymore */ + alig->aux = (str) GDKmalloc(alig->aux_size * sizeof(char)); + result = (result && alig->rname != NULL && + alig->rnext != NULL && alig->aux != NULL); + } + + return result; } static void -clear_alignment(alignment * alig) +clear_alignment(bam_wrapper *bw, alignment * alig) { if (alig->cigar) GDKfree(alig->cigar); @@ -1111,46 +1196,165 @@ clear_alignment(alignment * alig) GDKfree(alig->seq); if (alig->qual) GDKfree(alig->qual); + if (bw->type == SAM || bw->dbschema == 1) { + if (alig->qname) + GDKfree(alig->qname); + } + if (bw->type == SAM) { + if(alig->rname) + GDKfree(alig->rname); + if(alig->rnext) + GDKfree(alig->rnext); + if(alig->aux) + GDKfree(alig->aux); + } } /** * Function checks whether or not the character buffers in the given * alignment are big enough to hold the given number of characters. If - * not, it doubles the buffer sizes + * not, it doubles the buffer sizes. + * + * Function is only used for loading BAM files */ -static bit -check_alignment_buffers(alignment * alig, int cigar_size, int seq_size) +static inline bit +check_alignment_buffers(bam_wrapper *bw, alignment * alig, int qname_size, + int cigar_size, int seq_size) { - bit resized[] = { FALSE, FALSE }; + bit resized[] = { FALSE, FALSE, FALSE }; + + assert (bw->type == BAM); + + if (bw->dbschema == 1) { + while (qname_size >= alig->qname_size) { + resized[0] = TRUE; + alig->qname_size *= 2; + } + } while (cigar_size >= alig->cigar_size) { - resized[0] = TRUE; + resized[1] = TRUE; alig->cigar_size *= 2; } while (seq_size >= alig->seq_size) { - resized[1] = TRUE; + resized[2] = TRUE; alig->seq_size *= 2; } - if (resized[0]) + if (resized[0]) alig->cigar = GDKrealloc(alig->cigar, alig->cigar_size * sizeof(char)); - if (resized[1]) { + if (resized[1]) + alig->cigar = + GDKrealloc(alig->cigar, + alig->cigar_size * sizeof(char)); + if (resized[2]) { alig->seq = GDKrealloc(alig->seq, alig->seq_size * sizeof(char)); alig->qual = _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list