Changeset: c177fd86abb4 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c177fd86abb4
Modified Files:
        sql/backends/monet5/bam/Makefile.ag
        sql/backends/monet5/bam/bam_export.c
        sql/backends/monet5/bam/bam_wrapper.c
        sql/backends/monet5/bam/bam_wrapper.h
Branch: bamloader
Log Message:

reverting changes that introduce HTS library (and dependent changes) (here: 
changeset d149e885d77d)

HTS library is not yet widely available (in particular not on Fedora);
hence, for now we fall-back to using SAMTOOLS;
later, we will "gradually" shift towards using the HTS library.

(Robin, Sjoerd & Stefan)


diffs (truncated from 613 to 300 lines):

diff --git a/sql/backends/monet5/bam/Makefile.ag 
b/sql/backends/monet5/bam/Makefile.ag
--- a/sql/backends/monet5/bam/Makefile.ag
+++ b/sql/backends/monet5/bam/Makefile.ag
@@ -30,7 +30,8 @@ INCLUDES = .. \
        ../../../../common/stream \
        ../../../../gdk \
        ../../../../tools/merovingian \
-       ../../../../tools/merovingian/daemon
+       ../../../../tools/merovingian/daemon \
+       $(SAMTOOLS_CFLAGS)
 
 lib__bam = {
        MODULE
@@ -43,7 +44,7 @@ lib__bam = {
                          bam_export.c bam_export.h
        LIBS = ../../../../monetdb5/tools/libmonetdb5 \
                   ../../../../gdk/libbat \
-                  -lhts
+                  $(SAMTOOLS_LIBS)
 }
 
 headers_mal = {
diff --git a/sql/backends/monet5/bam/bam_export.c 
b/sql/backends/monet5/bam/bam_export.c
--- a/sql/backends/monet5/bam/bam_export.c
+++ b/sql/backends/monet5/bam/bam_export.c
@@ -27,11 +27,10 @@
 
 #include "monetdb_config.h"
 
-#include <htslib/sam.h>
-#include <htslib/kstring.h>
+#include <samtools/bam.h>
 
 #include "bam_globals.h"
-//#include "bam_db_interface.h"
+#include "bam_db_interface.h"
 #include "bam_export.h"
 
 
@@ -44,6 +43,189 @@ typedef struct bam_field {
 } bam_field;
 
 
+/**
+ * Copied directly from bam.h/bam_import.c for use by fill_bam_alig
+ * Can not change the calls to realloc to GDKrealloc, since
+ * bam_destroy1 does not use GDKfree...
+ */
+#ifndef kroundup32
+/*! @function
+  @abstract  Round an integer to the next closest power-2 integer.
+  @param  x  integer to be rounded (in place)
+  @discussion x will be modified.
+ */
+#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, 
(x)|=(x)>>8, (x)|=(x)>>16, ++(x))
+#endif
+static inline uint8_t *
+alloc_data(bam1_t *b, int size)
+{
+       if (b->m_data < size) {
+               b->m_data = size;
+               kroundup32(b->m_data);
+               b->data = (uint8_t*)realloc(b->data, b->m_data);
+       }
+       return b->data;
+}
+
+/* Error macro */
+#define FILL_BAM_ALIG_ERR "Error processing alignment '%d': "
+
+/**
+ * I had to write this function, which contains much low level
+ * function calls to the Samtools library, since the sam_read1 function
+ * contained in bam.h/bam_import.c only works on files. This function is
+ * an adjusted variant of the sam_read1 function that gets its fields
+ * directly instead of reads it from a tamFile.
+ * I wrote this function since I did not feel like first writing everything
+ * to a SAM file and then applying sam_read1 since that is just a waste of I/O.
+ * Note that I did do a write to/read from SAM for the BAM header, but headers 
are
+ * insignificant compared to alignment data.
+ *
+ * I also added realloc checking, since this was not done in the Samtools code
+ * and i properly throw a MAL_MALLOC_FAIL whenever applicable.
+ */
+static str
+fill_bam_alig(str qname, sht flag, str rname, int pos,
+       sht mapq, str cigar, str rnext, int pnext,
+       int tlen, str seq, str qual,
+       bam_header_t *header, bam1_t *b, int alignment_nr)
+{
+       int doff = 0;
+       bam1_core_t *c = &b->core;
+
+       /* Note: in sam_read1, str->s is often used. This string is not NULL 
terminated! */
+       { // qname
+               c->l_qname = strlen(qname) + 1;
+               if(alloc_data(b, doff + c->l_qname) == NULL) {
+                       throw(MAL, "fill_bam_alig",
+                               FILL_BAM_ALIG_ERR MAL_MALLOC_FAIL, 
alignment_nr);
+               }
+               memcpy(b->data + doff, qname, c->l_qname);
+               doff += c->l_qname;
+       }
+       { // flag
+               c->flag = flag;
+       }
+       { // rname, pos, mapq
+               c->tid = bam_get_tid(header, rname);
+               if (c->tid < 0 && strcmp(rname, "*") != 0) {
+                       /* Should not happen, since we built the header 
structure ourselves */
+                       throw(MAL, "fill_bam_alig",
+                               FILL_BAM_ALIG_ERR "SQ entry '%s' not found in 
header table",
+                               alignment_nr, rname);
+               }
+               c->pos = pos - 1;
+               c->qual = mapq;
+       }
+       { // cigar
+               str s, t;
+               int i, op;
+               long x;
+               c->n_cigar = 0;
+               if (cigar[0] != '*') {
+                       uint32_t *cigar_enc;
+                       for (s = cigar; *s != '\0'; ++s) {
+                               if ((isalpha(*s)) || (*s=='=')) {
+                                       ++c->n_cigar;
+                               }
+                               else if (!isdigit(*s)) {
+                                       throw(MAL, "fill_bam_alig",
+                                               FILL_BAM_ALIG_ERR "Parse error 
while parsing CIGAR string '%s'",
+                                               alignment_nr, cigar);
+                               }
+                       }
+                       if (alloc_data(b, doff + c->n_cigar * 4) == NULL) {
+                               throw(MAL, "fill_bam_alig",
+                                       FILL_BAM_ALIG_ERR MAL_MALLOC_FAIL, 
alignment_nr);
+                       }
+                       cigar_enc = bam1_cigar(b);
+                       for (i = 0, s = cigar; i != c->n_cigar; ++i) {
+                               x = strtol(s, &t, 10);
+                               op = toupper(*t);
+                               if (op == 'M') op = BAM_CMATCH;
+                               else if (op == 'I') op = BAM_CINS;
+                               else if (op == 'D') op = BAM_CDEL;
+                               else if (op == 'N') op = BAM_CREF_SKIP;
+                               else if (op == 'S') op = BAM_CSOFT_CLIP;
+                               else if (op == 'H') op = BAM_CHARD_CLIP;
+                               else if (op == 'P') op = BAM_CPAD;
+                               else if (op == '=') op = BAM_CEQUAL;
+                               else if (op == 'X') op = BAM_CDIFF;
+                               else if (op == 'B') op = BAM_CBACK;
+                               else throw(MAL, "fill_bam_alig",
+                                       FILL_BAM_ALIG_ERR "invalid CIGAR 
operation found in CIGAR string '%s': '%c'",
+                                       alignment_nr, cigar, op);
+                               s = t + 1;
+                               cigar_enc[i] = bam_cigar_gen(x, op);
+                       }
+                       if (*s) throw(MAL, "fill_bam_alig",
+                               FILL_BAM_ALIG_ERR "Unmatched CIGAR operation in 
CIGAR string '%s'",
+                               alignment_nr, cigar);
+                       c->bin = bam_reg2bin(c->pos, bam_calend(c, cigar_enc));
+                       doff += c->n_cigar * 4;
+               } else {
+                       if (!(c->flag&BAM_FUNMAP)) {
+                               c->flag |= BAM_FUNMAP;
+                       }
+                       c->bin = bam_reg2bin(c->pos, c->pos + 1);
+               }
+       }
+       { // rnext, pnext, tlen
+               c->mtid = strcmp(rnext, "=") != 0 ? bam_get_tid(header, rnext) 
: c->tid;
+               c->mpos = pnext - 1;
+               c->isize = tlen;
+       }
+       { // seq and qual
+               int i;
+               uint8_t *p = 0;
+               if (strcmp(seq, "*") != 0) {
+                       c->l_qseq = strlen(seq);
+                       if (c->n_cigar && c->l_qseq != 
(int32_t)bam_cigar2qlen(c, bam1_cigar(b))) {
+                               throw(MAL, "fill_bam_alig",
+                                       FILL_BAM_ALIG_ERR "CIGAR and sequence 
length inconsistency: %d (SEQ) vs %d (CIGAR)",
+                                       alignment_nr, c->l_qseq, 
(int32_t)bam_cigar2qlen(c, bam1_cigar(b)));
+                       }
+                       p = (uint8_t*)alloc_data(b, doff + c->l_qseq + 
(c->l_qseq+1)/2) + doff;
+                       if(p == NULL) {
+                               throw(MAL, "fill_bam_alig",
+                                       FILL_BAM_ALIG_ERR MAL_MALLOC_FAIL, 
alignment_nr);
+                       }
+                       memset(p, 0, (c->l_qseq+1)/2);
+                       for (i = 0; i < c->l_qseq; ++i)
+                               p[i/2] |= bam_nt16_table[(int)seq[i]] << 
4*(1-i%2);
+               } else {
+                       c->l_qseq = 0;
+               }
+               if (strcmp(qual, "*") != 0 && (size_t)c->l_qseq != 
strlen(qual)) {
+                       throw(MAL, "fill_bam_alig",
+                               FILL_BAM_ALIG_ERR "SEQ and QUAL are 
inconsistent",
+                               alignment_nr);
+               }
+               p += (c->l_qseq+1)/2;
+               if (strcmp(qual, "*") == 0) {
+                       for (i = 0; i < c->l_qseq; ++i) {
+                               p[i] = 0xff;
+                       }
+               } else {
+                       for (i = 0; i < c->l_qseq; ++i) {
+                               p[i] = qual[i] - 33;
+                       }
+               }
+               doff += c->l_qseq + (c->l_qseq+1)/2;
+       }
+
+       b->data_len = doff;
+       if (bam_no_B) {
+               bam_remove_B(b);
+       }
+
+       return MAL_SUCCEED;
+}
+
+
+
+
+
 static str
 bind_export_result(Client cntxt, MalBlkPtr mb, bam_field fields[11], int 
*tuple_count)
 {
@@ -190,7 +372,7 @@ write_header(stream *output, bam_field f
                //mnstr_printf(output, "%s\t0\n", sq_table[i]);
        }
 
-       mnstr_printf(output, "@CO\tSAM file generated by MonetDB BAM 
library\n");
+       mnstr_printf(output, "@CO\tSAM file generated by MonetDB\n");
 
 cleanup:
        if (sq_table) {
@@ -229,7 +411,7 @@ sam_export(Client cntxt, MalBlkPtr mb, M
        int tuple_count = 0;
 
        int i;
-       //str sql;
+       str sql;
        str msg = MAL_SUCCEED;
 
        memset(fields, 0, 11 * sizeof(bam_field));
@@ -255,11 +437,11 @@ sam_export(Client cntxt, MalBlkPtr mb, M
        }
 
        /* If we got here, we succesfully exported the result. Drop all data in 
export table */
-       /*sql = "DELETE FROM bam.export;";
+       sql = "DELETE FROM bam.export;";
        RUN_SQL(cntxt, &sql, "bam.drop_export", msg);
        if (msg != MAL_SUCCEED) {
                REUSE_EXCEPTION(msg, MAL, "sam_export", "Could not clear the 
export table after exporting: %s", msg);
-       }*/
+       }
 
        (void)stk;
        (void)pci;
@@ -274,43 +456,38 @@ cleanup:
 }
 
 
-/**
- * bam_export uses htslib functionality to write the export table to an actual 
bam file
- * It does this by first writing the SAM header to a temporary SAM file, since 
the htslib provides 
- * convenient header parsing functionality on files. 
- * We could do a similar thing for the alignments, but this would add a whole 
lot of I/O. So we took
- * some extra effort to write the alignments to bam1_t structs ourselves (of 
course with help of
- * htslib )
- */
+
 str
-bam_export(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) {
+bam_export(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci)
+{
        /* arg 1: path to desired output file */
        str output_path = *(str *) getArgReference(stk, pci, pci->retc);
 
-       /* Stuff necessary for writing header to BAM */
-       char header_path[1024] = "";
-       stream *output_header = NULL;
-       samFile *sam_header = NULL;
-       bam_hdr_t *header = NULL;
-
-       /* Actual BAM output file and vars to write alignments */
-       samFile *output = NULL;
-       kstring_t sam_line;
-       bam1_t *alig = NULL;
-       char parse_err_buf[2048];
-       FILE *original_stderr;
-
-       /* Others */
+       bamFile output = NULL;
        bam_field fields[11];
        int tuple_count = 0;
+
+       char output_header_path[1024] = "";
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to