Changeset: 97d38cacec07 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=97d38cacec07 Modified Files: monetdb5/optimizer/opt_support.c sql/backends/monet5/bam/bam_db_interface.c sql/backends/monet5/bam/bam_export.c sql/backends/monet5/bam/bam_wrapper.c Branch: bamloader Log Message:
Implemented bam_export function and made earlier written functionality for SAM export more generic. Only problem left is that the SAM header does not get parsed properly for some reason. Therefore, no dictionary is built containing the different chromosomes. This will need some extra time to tackle. diffs (truncated from 644 to 300 lines): diff --git a/monetdb5/optimizer/opt_support.c b/monetdb5/optimizer/opt_support.c --- a/monetdb5/optimizer/opt_support.c +++ b/monetdb5/optimizer/opt_support.c @@ -840,13 +840,9 @@ int isAllScalar(MalBlkPtr mb, InstrPtr p int isMapOp(InstrPtr p){ return getModuleId(p) && ((getModuleId(p) == malRef && getFunctionId(p) == multiplexRef) || - (getModuleId(p)== batcalcRef && getFunctionId(p) != mark_grpRef && getFunctionId(p) != rank_grpRef) || - (getModuleId(p)== batmtimeRef) || - (getModuleId(p)== batstrRef) || - (getModuleId(p)== batmmathRef) || - (getModuleId(p)== batxmlRef) || - (strcmp(getModuleId(p),"batsql") == 0) || - (getModuleId(p)== mkeyRef)); + (getModuleId(p) == batcalcRef && getFunctionId(p) != mark_grpRef && getFunctionId(p) != rank_grpRef) || + (getModuleId(p) != batcalcRef && strncmp(getModuleId(p), "bat", 3) == 0) || + (getModuleId(p) == mkeyRef)); } int isLikeOp(InstrPtr p){ diff --git a/sql/backends/monet5/bam/bam_db_interface.c b/sql/backends/monet5/bam/bam_db_interface.c --- a/sql/backends/monet5/bam/bam_db_interface.c +++ b/sql/backends/monet5/bam/bam_db_interface.c @@ -299,6 +299,8 @@ next_file_id(mvc * m, sql_table * files, BATiter li; BUN p = 0, q = 0; lng max_file_id = 0; + + sht i; assert(m != NULL); assert(files != NULL); @@ -309,15 +311,17 @@ next_file_id(mvc * m, sql_table * files, "Could not retrieve the next file id: Error binding file_id column of 'files' table"); } - /* Loop through BAT for this column and find the maximum file_id */ - b = store_funcs.bind_col(m->session->tr, c, RD_INS); + /* Loop through BATs for this column and find the maximum file_id */ + for(i=0; i<3; ++i) { + b = store_funcs.bind_col(m->session->tr, c, i); - li = bat_iterator(b); - BATloop(b, p, q) { - lng t = *(lng *) BUNtail(li, p); - max_file_id = MAX(max_file_id, t); + li = bat_iterator(b); + BATloop(b, p, q) { + lng t = *(lng *) BUNtail(li, p); + max_file_id = MAX(max_file_id, t); + } + BBPreleaseref(b->batCacheid); } - BBPreleaseref(b->batCacheid); *next_file_id = max_file_id + 1; return MAL_SUCCEED; diff --git a/sql/backends/monet5/bam/bam_export.c b/sql/backends/monet5/bam/bam_export.c --- a/sql/backends/monet5/bam/bam_export.c +++ b/sql/backends/monet5/bam/bam_export.c @@ -27,6 +27,8 @@ #include "monetdb_config.h" +#include <samtools/bam.h> + #include "bam_globals.h" #include "bam_export.h" @@ -40,46 +42,210 @@ typedef struct bam_field { } bam_field; -#define CUR_STR(field, i) ((str) BUNtail(field.iter, (field.cur+i))) -#define CUR_SHT(field, i) (*(sht *) BUNtail(field.iter, (field.cur+i))) -#define CUR_INT(field, i) (*(int *) BUNtail(field.iter, (field.cur+i))) +/** + * Copied directly from bam.h/bam_import.c for use by fill_bam_alig + * Can not change the call to realloc to GDKrealloc, since + * bam_destroy1 does not use GDKfree... + */ +#ifndef kroundup32 +/*! @function + @abstract Round an integer to the next closest power-2 integer. + @param x integer to be rounded (in place) + @discussion x will be modified. + */ +#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) +#endif +static inline uint8_t * +alloc_data(bam1_t *b, int size) +{ + if (b->m_data < size) { + b->m_data = size; + kroundup32(b->m_data); + b->data = (uint8_t*)realloc(b->data, b->m_data); + } + return b->data; +} -str -sam_export(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) +/* Error macro */ +#define FILL_BAM_ALIG_ERR "Error processing alignment '%d': " + +/** + * I had to write this function, which contains much low level + * function calls to the Samtools library, since the sam_read1 function + * contained in bam.h/bam_import.c only works on files. This function is + * an adjusted variant of the sam_read1 function that gets its fields + * directly instead of reads it from a tamFile. + * I wrote this function since I did not feel like first writing everything + * to a SAM file and then applying sam_read1 since that is just a waste of I/O. + * Note that I did do a write to/read from SAM for the BAM header, but headers are + * insignificant compared to alignment data. + * + * I also added realloc checking, since this was not done in the Samtools code + * and i properly throw a MAL_MALLOC_FAIL whenever applicable. + */ +static str +fill_bam_alig(str qname, sht flag, str rname, int pos, + sht mapq, str cigar, str rnext, int pnext, + int tlen, str seq, str qual, + bam_header_t *header, bam1_t *b, int alignment_nr) { - /* arg 1: path to desired output file */ - str output_path = *(str *) getArgReference(stk, pci, pci->retc); + int doff = 0; + bam1_core_t *c = &b->core; + + /* Note: in sam_read1, str->s is often used. This string is not NULL terminated! */ + { // qname + c->l_qname = strlen(qname) + 1; + if(alloc_data(b, doff + c->l_qname) == NULL) { + throw(MAL, "fill_bam_alig", + FILL_BAM_ALIG_ERR MAL_MALLOC_FAIL, alignment_nr); + } + memcpy(b->data + doff, qname, c->l_qname); + doff += c->l_qname; + } + { // flag + c->flag = flag; + } + { // rname, pos, mapq + c->tid = bam_get_tid(header, rname); + if (c->tid < 0 && strcmp(rname, "*") != 0) { + /* Should not happen, since we built the header structure ourselves */ + throw(MAL, "fill_bam_alig", + FILL_BAM_ALIG_ERR "SQ entry '%s' not found in header table", + alignment_nr, rname); + } + c->pos = pos - 1; + c->qual = mapq; + } + { // cigar + str s, t; + int i, op; + long x; + c->n_cigar = 0; + if (cigar[0] != '*') { + uint32_t *cigar_enc; + for (s = cigar; *s != '\0'; ++s) { + if ((isalpha(*s)) || (*s=='=')) { + ++c->n_cigar; + } + else if (!isdigit(*s)) { + throw(MAL, "fill_bam_alig", + FILL_BAM_ALIG_ERR "Parse error while parsing CIGAR string '%s'", + alignment_nr, cigar); + } + } + if (alloc_data(b, doff + c->n_cigar * 4) == NULL) { + throw(MAL, "fill_bam_alig", + FILL_BAM_ALIG_ERR MAL_MALLOC_FAIL, alignment_nr); + } + cigar_enc = bam1_cigar(b); + for (i = 0, s = cigar; i != c->n_cigar; ++i) { + x = strtol(s, &t, 10); + op = toupper(*t); + if (op == 'M') op = BAM_CMATCH; + else if (op == 'I') op = BAM_CINS; + else if (op == 'D') op = BAM_CDEL; + else if (op == 'N') op = BAM_CREF_SKIP; + else if (op == 'S') op = BAM_CSOFT_CLIP; + else if (op == 'H') op = BAM_CHARD_CLIP; + else if (op == 'P') op = BAM_CPAD; + else if (op == '=') op = BAM_CEQUAL; + else if (op == 'X') op = BAM_CDIFF; + else if (op == 'B') op = BAM_CBACK; + else throw(MAL, "fill_bam_alig", + FILL_BAM_ALIG_ERR "invalid CIGAR operation found in CIGAR string '%s': '%c'", + alignment_nr, cigar, op); + s = t + 1; + cigar_enc[i] = bam_cigar_gen(x, op); + } + if (*s) throw(MAL, "fill_bam_alig", + FILL_BAM_ALIG_ERR "Unmatched CIGAR operation in CIGAR string '%s'", + alignment_nr, cigar); + c->bin = bam_reg2bin(c->pos, bam_calend(c, cigar_enc)); + doff += c->n_cigar * 4; + } else { + if (!(c->flag&BAM_FUNMAP)) { + c->flag |= BAM_FUNMAP; + } + c->bin = bam_reg2bin(c->pos, c->pos + 1); + } + } + { // rnext, pnext, tlen + c->mtid = strcmp(rnext, "=") != 0 ? bam_get_tid(header, rnext) : c->tid; + c->mpos = pnext - 1; + c->isize = tlen; + } + { // seq and qual + int i; + uint8_t *p = 0; + if (strcmp(seq, "*") != 0) { + c->l_qseq = strlen(seq); + if (c->n_cigar && c->l_qseq != (int32_t)bam_cigar2qlen(c, bam1_cigar(b))) { + throw(MAL, "fill_bam_alig", + FILL_BAM_ALIG_ERR "CIGAR and sequence length inconsistency: %d (SEQ) vs %d (CIGAR)", + alignment_nr, c->l_qseq, (int32_t)bam_cigar2qlen(c, bam1_cigar(b))); + } + p = (uint8_t*)alloc_data(b, doff + c->l_qseq + (c->l_qseq+1)/2) + doff; + if(p == NULL) { + throw(MAL, "fill_bam_alig", + FILL_BAM_ALIG_ERR MAL_MALLOC_FAIL, alignment_nr); + } + memset(p, 0, (c->l_qseq+1)/2); + for (i = 0; i < c->l_qseq; ++i) + p[i/2] |= bam_nt16_table[(int)seq[i]] << 4*(1-i%2); + } else { + c->l_qseq = 0; + } + if (strcmp(qual, "*") != 0 && (size_t)c->l_qseq != strlen(qual)) { + throw(MAL, "fill_bam_alig", + FILL_BAM_ALIG_ERR "SEQ and QUAL are inconsistent", + alignment_nr); + } + p += (c->l_qseq+1)/2; + if (strcmp(qual, "*") == 0) { + for (i = 0; i < c->l_qseq; ++i) { + p[i] = 0xff; + } + } else { + for (i = 0; i < c->l_qseq; ++i) { + p[i] = qual[i] - 33; + } + } + doff += c->l_qseq + (c->l_qseq+1)/2; + } + b->data_len = doff; + if (bam_no_B) { + bam_remove_B(b); + } + + return MAL_SUCCEED; +} + + + + + +static str +bind_export_result(Client cntxt, MalBlkPtr mb, bam_field fields[11], int *tuple_count) +{ mvc *m; sql_schema *s_bam; sql_table *t_export; - bam_field fields[11]; - - stream *output = NULL; int cnt, i; - str msg = MAL_SUCCEED; - - memset(fields, 0, 11 * sizeof(bam_field)); - - if ((output = bsopen(output_path)) == NULL) { - msg = createException(MAL, "sam_export", "Could not open output file '%s' for writing", output_path); - goto cleanup; - } + str msg; if ((msg = getSQLContext(cntxt, mb, &m, NULL)) != MAL_SUCCEED) { - REUSE_EXCEPTION(msg, MAL, "sam_export", "Could not retrieve SQL context: %s", msg); - goto cleanup; + REUSE_EXCEPTION(msg, MAL, "bind_export_result", "Could not retrieve SQL context: %s", msg); + return msg; } if ((s_bam = mvc_bind_schema(m, "bam")) == NULL) { - msg = createException(MAL, "sam_export", "Could not find bam schema"); - goto cleanup; + throw(MAL, "bind_export_result", "Could not find bam schema"); } if ((t_export = mvc_bind_table(m, s_bam, "export")) == NULL) { - msg = createException(MAL, "sam_export", "Could not find bam.export table"); - goto cleanup; _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list