Changeset: d149e885d77d for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=d149e885d77d Modified Files: sql/backends/monet5/bam/Makefile.ag sql/backends/monet5/bam/bam_export.c sql/backends/monet5/bam/bam_wrapper.c sql/backends/monet5/bam/bam_wrapper.h Branch: bamloader Log Message:
finished bam_export. For this, switched whole bam library code from libbam to libhts since it provides better functionality for our purpose. diffs (truncated from 612 to 300 lines): diff --git a/sql/backends/monet5/bam/Makefile.ag b/sql/backends/monet5/bam/Makefile.ag --- a/sql/backends/monet5/bam/Makefile.ag +++ b/sql/backends/monet5/bam/Makefile.ag @@ -30,8 +30,7 @@ INCLUDES = .. \ ../../../../common/stream \ ../../../../gdk \ ../../../../tools/merovingian \ - ../../../../tools/merovingian/daemon \ - $(SAMTOOLS_CFLAGS) + ../../../../tools/merovingian/daemon lib__bam = { MODULE @@ -44,7 +43,7 @@ lib__bam = { bam_export.c bam_export.h LIBS = ../../../../monetdb5/tools/libmonetdb5 \ ../../../../gdk/libbat \ - $(SAMTOOLS_LIBS) + -lhts } headers_mal = { diff --git a/sql/backends/monet5/bam/bam_export.c b/sql/backends/monet5/bam/bam_export.c --- a/sql/backends/monet5/bam/bam_export.c +++ b/sql/backends/monet5/bam/bam_export.c @@ -27,10 +27,11 @@ #include "monetdb_config.h" -#include <samtools/bam.h> +#include <htslib/sam.h> +#include <htslib/kstring.h> #include "bam_globals.h" -#include "bam_db_interface.h" +//#include "bam_db_interface.h" #include "bam_export.h" @@ -43,189 +44,6 @@ typedef struct bam_field { } bam_field; -/** - * Copied directly from bam.h/bam_import.c for use by fill_bam_alig - * Can not change the calls to realloc to GDKrealloc, since - * bam_destroy1 does not use GDKfree... - */ -#ifndef kroundup32 -/*! @function - @abstract Round an integer to the next closest power-2 integer. - @param x integer to be rounded (in place) - @discussion x will be modified. - */ -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif -static inline uint8_t * -alloc_data(bam1_t *b, int size) -{ - if (b->m_data < size) { - b->m_data = size; - kroundup32(b->m_data); - b->data = (uint8_t*)realloc(b->data, b->m_data); - } - return b->data; -} - -/* Error macro */ -#define FILL_BAM_ALIG_ERR "Error processing alignment '%d': " - -/** - * I had to write this function, which contains much low level - * function calls to the Samtools library, since the sam_read1 function - * contained in bam.h/bam_import.c only works on files. This function is - * an adjusted variant of the sam_read1 function that gets its fields - * directly instead of reads it from a tamFile. - * I wrote this function since I did not feel like first writing everything - * to a SAM file and then applying sam_read1 since that is just a waste of I/O. - * Note that I did do a write to/read from SAM for the BAM header, but headers are - * insignificant compared to alignment data. - * - * I also added realloc checking, since this was not done in the Samtools code - * and i properly throw a MAL_MALLOC_FAIL whenever applicable. - */ -static str -fill_bam_alig(str qname, sht flag, str rname, int pos, - sht mapq, str cigar, str rnext, int pnext, - int tlen, str seq, str qual, - bam_header_t *header, bam1_t *b, int alignment_nr) -{ - int doff = 0; - bam1_core_t *c = &b->core; - - /* Note: in sam_read1, str->s is often used. This string is not NULL terminated! */ - { // qname - c->l_qname = strlen(qname) + 1; - if(alloc_data(b, doff + c->l_qname) == NULL) { - throw(MAL, "fill_bam_alig", - FILL_BAM_ALIG_ERR MAL_MALLOC_FAIL, alignment_nr); - } - memcpy(b->data + doff, qname, c->l_qname); - doff += c->l_qname; - } - { // flag - c->flag = flag; - } - { // rname, pos, mapq - c->tid = bam_get_tid(header, rname); - if (c->tid < 0 && strcmp(rname, "*") != 0) { - /* Should not happen, since we built the header structure ourselves */ - throw(MAL, "fill_bam_alig", - FILL_BAM_ALIG_ERR "SQ entry '%s' not found in header table", - alignment_nr, rname); - } - c->pos = pos - 1; - c->qual = mapq; - } - { // cigar - str s, t; - int i, op; - long x; - c->n_cigar = 0; - if (cigar[0] != '*') { - uint32_t *cigar_enc; - for (s = cigar; *s != '\0'; ++s) { - if ((isalpha(*s)) || (*s=='=')) { - ++c->n_cigar; - } - else if (!isdigit(*s)) { - throw(MAL, "fill_bam_alig", - FILL_BAM_ALIG_ERR "Parse error while parsing CIGAR string '%s'", - alignment_nr, cigar); - } - } - if (alloc_data(b, doff + c->n_cigar * 4) == NULL) { - throw(MAL, "fill_bam_alig", - FILL_BAM_ALIG_ERR MAL_MALLOC_FAIL, alignment_nr); - } - cigar_enc = bam1_cigar(b); - for (i = 0, s = cigar; i != c->n_cigar; ++i) { - x = strtol(s, &t, 10); - op = toupper(*t); - if (op == 'M') op = BAM_CMATCH; - else if (op == 'I') op = BAM_CINS; - else if (op == 'D') op = BAM_CDEL; - else if (op == 'N') op = BAM_CREF_SKIP; - else if (op == 'S') op = BAM_CSOFT_CLIP; - else if (op == 'H') op = BAM_CHARD_CLIP; - else if (op == 'P') op = BAM_CPAD; - else if (op == '=') op = BAM_CEQUAL; - else if (op == 'X') op = BAM_CDIFF; - else if (op == 'B') op = BAM_CBACK; - else throw(MAL, "fill_bam_alig", - FILL_BAM_ALIG_ERR "invalid CIGAR operation found in CIGAR string '%s': '%c'", - alignment_nr, cigar, op); - s = t + 1; - cigar_enc[i] = bam_cigar_gen(x, op); - } - if (*s) throw(MAL, "fill_bam_alig", - FILL_BAM_ALIG_ERR "Unmatched CIGAR operation in CIGAR string '%s'", - alignment_nr, cigar); - c->bin = bam_reg2bin(c->pos, bam_calend(c, cigar_enc)); - doff += c->n_cigar * 4; - } else { - if (!(c->flag&BAM_FUNMAP)) { - c->flag |= BAM_FUNMAP; - } - c->bin = bam_reg2bin(c->pos, c->pos + 1); - } - } - { // rnext, pnext, tlen - c->mtid = strcmp(rnext, "=") != 0 ? bam_get_tid(header, rnext) : c->tid; - c->mpos = pnext - 1; - c->isize = tlen; - } - { // seq and qual - int i; - uint8_t *p = 0; - if (strcmp(seq, "*") != 0) { - c->l_qseq = strlen(seq); - if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) { - throw(MAL, "fill_bam_alig", - FILL_BAM_ALIG_ERR "CIGAR and sequence length inconsistency: %d (SEQ) vs %d (CIGAR)", - alignment_nr, c->l_qseq, (int32_t)bam_cigar2qlen(c, bam1_cigar(b))); - } - p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff; - if(p == NULL) { - throw(MAL, "fill_bam_alig", - FILL_BAM_ALIG_ERR MAL_MALLOC_FAIL, alignment_nr); - } - memset(p, 0, (c->l_qseq+1)/2); - for (i = 0; i < c->l_qseq; ++i) - p[i/2] |= bam_nt16_table[(int)seq[i]] << 4*(1-i%2); - } else { - c->l_qseq = 0; - } - if (strcmp(qual, "*") != 0 && (size_t)c->l_qseq != strlen(qual)) { - throw(MAL, "fill_bam_alig", - FILL_BAM_ALIG_ERR "SEQ and QUAL are inconsistent", - alignment_nr); - } - p += (c->l_qseq+1)/2; - if (strcmp(qual, "*") == 0) { - for (i = 0; i < c->l_qseq; ++i) { - p[i] = 0xff; - } - } else { - for (i = 0; i < c->l_qseq; ++i) { - p[i] = qual[i] - 33; - } - } - doff += c->l_qseq + (c->l_qseq+1)/2; - } - - b->data_len = doff; - if (bam_no_B) { - bam_remove_B(b); - } - - return MAL_SUCCEED; -} - - - - - static str bind_export_result(Client cntxt, MalBlkPtr mb, bam_field fields[11], int *tuple_count) { @@ -372,7 +190,7 @@ write_header(stream *output, bam_field f //mnstr_printf(output, "%s\t0\n", sq_table[i]); } - mnstr_printf(output, "@CO\tSAM file generated by MonetDB\n"); + mnstr_printf(output, "@CO\tSAM file generated by MonetDB BAM library\n"); cleanup: if (sq_table) { @@ -411,7 +229,7 @@ sam_export(Client cntxt, MalBlkPtr mb, M int tuple_count = 0; int i; - str sql; + //str sql; str msg = MAL_SUCCEED; memset(fields, 0, 11 * sizeof(bam_field)); @@ -437,11 +255,11 @@ sam_export(Client cntxt, MalBlkPtr mb, M } /* If we got here, we succesfully exported the result. Drop all data in export table */ - sql = "DELETE FROM bam.export;"; + /*sql = "DELETE FROM bam.export;"; RUN_SQL(cntxt, &sql, "bam.drop_export", msg); if (msg != MAL_SUCCEED) { REUSE_EXCEPTION(msg, MAL, "sam_export", "Could not clear the export table after exporting: %s", msg); - } + }*/ (void)stk; (void)pci; @@ -456,38 +274,43 @@ cleanup: } - +/** + * bam_export uses htslib functionality to write the export table to an actual bam file + * It does this by first writing the SAM header to a temporary SAM file, since the htslib provides + * convenient header parsing functionality on files. + * We could do a similar thing for the alignments, but this would add a whole lot of I/O. So we took + * some extra effort to write the alignments to bam1_t structs ourselves (of course with help of + * htslib ) + */ str -bam_export(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) -{ +bam_export(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) { /* arg 1: path to desired output file */ str output_path = *(str *) getArgReference(stk, pci, pci->retc); - bamFile output = NULL; + /* Stuff necessary for writing header to BAM */ + char header_path[1024] = ""; + stream *output_header = NULL; + samFile *sam_header = NULL; + bam_hdr_t *header = NULL; + + /* Actual BAM output file and vars to write alignments */ + samFile *output = NULL; + kstring_t sam_line; + bam1_t *alig = NULL; + char parse_err_buf[2048]; + FILE *original_stderr; + + /* Others */ bam_field fields[11]; int tuple_count = 0; - - char output_header_path[1024] = ""; _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list