Changeset: 84ebb8ecd572 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=84ebb8ecd572 Modified Files: sql/backends/monet5/bam/85_bam.sql sql/backends/monet5/bam/bam.mal sql/backends/monet5/bam/bam_lib.c sql/backends/monet5/bam/bam_lib.h Branch: bamloader Log Message:
Added UDF for calculating the character from an alignment that maps to some position in the reference string, given this position and the alignment pos, cigar and seq fields diffs (182 lines): diff --git a/sql/backends/monet5/bam/85_bam.sql b/sql/backends/monet5/bam/85_bam.sql --- a/sql/backends/monet5/bam/85_bam.sql +++ b/sql/backends/monet5/bam/85_bam.sql @@ -25,6 +25,9 @@ RETURNS STRING EXTERNAL NAME bam.reverse CREATE FUNCTION bam.seq_length(cigar STRING) RETURNS INT EXTERNAL NAME bam.seq_length; +CREATE FUNCTION bam.seq_char(ref_pos INT, alg_seq STRING, alg_pos INT, alg_cigar STRING) +RETURNS CHAR(1) EXTERNAL NAME bam.seq_char; + CREATE PROCEDURE bam.sam_export(output_path STRING) EXTERNAL NAME bam.sam_export; diff --git a/sql/backends/monet5/bam/bam.mal b/sql/backends/monet5/bam/bam.mal --- a/sql/backends/monet5/bam/bam.mal +++ b/sql/backends/monet5/bam/bam.mal @@ -38,6 +38,10 @@ command seq_length(cigar:str):int address seq_length comment "Calculate the real length of a DNA sequence, given its CIGAR string." +command seq_char(ref_pos:int, alg_seq:str, alg_pos:int, alg_cigar:str):str +address seq_char +comment "Calculate the character in the alignment string (alg_str) that is aligned to position 'ref_pos', conforming to the given cigar string" + # Export signatures @@ -69,3 +73,7 @@ comment "Reverse a bat of DNA Quality st command seq_length(cigars:bat[:oid,:str]):bat[:oid,:int] address seq_length_bat comment "Calculate the real length of a bat of DNA sequences, given their CIGAR string." + +command seq_char(ref_pos:int, alg_seq:bat[:oid,:str], alg_pos:bat[:oid,:int], alg_cigar:bat[:oid,:str]):bat[:oid,:str] +address seq_char +comment "Calculate the character in the alignment string (alg_str) that is aligned to position 'ref_pos', conforming to the given cigar string (bat based version)" diff --git a/sql/backends/monet5/bam/bam_lib.c b/sql/backends/monet5/bam/bam_lib.c --- a/sql/backends/monet5/bam/bam_lib.c +++ b/sql/backends/monet5/bam/bam_lib.c @@ -141,7 +141,7 @@ reverse_qual(str * ret, str * qual) } str -seq_length(int *ret, str * cigar) +seq_length(int * ret, str * cigar) { int result = 0; str cigar_consumable = *cigar; @@ -169,6 +169,52 @@ seq_length(int *ret, str * cigar) return MAL_SUCCEED; } +str +seq_char(str * ret, int * ref_pos, str * alg_seq, int * alg_pos, str * alg_cigar) +{ + str cigar_consumable = *alg_cigar; + int seq_pos = 0; + bit at_character = false; + int iterate_until = *ref_pos - *alg_pos; + str result; + + if((result = GDKmalloc(2 * sizeof(char))) == NULL) { + throw(MAL, "seq_char", MAL_MALLOC_FAIL); + } + result[1] = '\0'; + + if (cigar_consumable[0] == '*' && cigar_consumable[1] == '\0') { + result[0] = '\0'; + *ret = result; + return MAL_SUCCEED; + } + while(cigar_consumable[0] != '\0' && seq_pos < iterate_until) { + int cnt; + char op; + int nr_chars_read; + + if (sscanf + (cigar_consumable, "%d%c%n", &cnt, &op, + &nr_chars_read) != 2) + throw(MAL, "seq_char", + "Error parsing CIGAR string '%s'\n", *alg_cigar); + if (op == 'M' || op == 'D' || op == 'N' || op == '=' + || op == 'X') { + seq_pos += cnt; + if(seq_pos > iterate_until) + seq_pos = iterate_until; + at_character = true; + } else { + at_character = false; + } + cigar_consumable += nr_chars_read; + } + result[0] = at_character ? (*alg_seq)[seq_pos] : '\0'; + *ret = result; + return MAL_SUCCEED; +} + + str @@ -335,3 +381,65 @@ seq_length_bat(bat * ret, bat * bid) return MAL_SUCCEED; } + + +str +seq_char_bat(bat * ret, int * ref_pos, bat * alg_seq, bat * alg_pos, bat * alg_cigar) +{ + BAT *seqs, *poss, *cigars, *result; + BUN seq = 0, pos = 0, cigar = 0, seq_end = 0; + BATiter seq_it, pos_it, cigar_it; + + assert(ret != NULL && ref_pos != NULL && alg_seq != NULL && alg_pos != NULL && alg_cigar != NULL); + + if ((seqs = BATdescriptor(*alg_seq)) == NULL || + (poss = BATdescriptor(*alg_pos)) == NULL || + (cigars = BATdescriptor(*alg_cigar)) == NULL) + throw(MAL, "seq_char_bat", RUNTIME_OBJECT_MISSING); + + if(BATcount(seqs) != BATcount(poss) || BATcount(seqs) != BATcount(cigars)) { + throw(MAL, "seq_char_bat", "Misalignment in input BATs"); + } + + /* allocate result BAT */ + result = BATnew(TYPE_void, TYPE_str, BATcount(cigars), TRANSIENT); + if (result == NULL) { + throw(MAL, "seq_char_bat", MAL_MALLOC_FAIL); + } + BATseqbase(result, seqs->hseqbase); + + seq = BUNfirst(seqs); + pos = BUNfirst(poss); + cigar = BUNfirst(cigars); + seq_end = BUNlast(seqs); + + seq_it = bat_iterator(seqs); + pos_it = bat_iterator(poss); + cigar_it = bat_iterator(cigars); + + while(seq < seq_end) { + str seq_val = (str) BUNtail(seq_it, seq); + int * pos_val = (int *) BUNtail(pos_it, pos); + str cigar_val = (str) BUNtail(cigar_it, cigar); + str r; + str msg; + + if ((msg = seq_char(&r, ref_pos, &seq_val, pos_val, &cigar_val)) != MAL_SUCCEED) { + BBPreleaseref(result->batCacheid); + return msg; + } + BUNappend(result, (ptr) &r, FALSE); + ++seq; + ++pos; + ++cigar; + } + + /* release input BAT-descriptors */ + BBPreleaseref(seqs->batCacheid); + BBPreleaseref(poss->batCacheid); + BBPreleaseref(cigars->batCacheid); + + BBPkeepref((*ret = result->batCacheid)); + + return MAL_SUCCEED; +} diff --git a/sql/backends/monet5/bam/bam_lib.h b/sql/backends/monet5/bam/bam_lib.h --- a/sql/backends/monet5/bam/bam_lib.h +++ b/sql/backends/monet5/bam/bam_lib.h @@ -42,10 +42,12 @@ bam_export str bam_flag(bit * ret, sht * bam_export str reverse_seq(str * ret, str * seq); bam_export str reverse_qual(str * ret, str * qual); bam_export str seq_length(int *ret, str * cigar); +bam_export str seq_char(str * ret, int * ref_pos, str * alg_seq, int * alg_pos, str * alg_cigar); bam_export str bam_flag_bat(bat * ret, bat * bid, str * name); bam_export str reverse_seq_bat(bat * ret, bat * bid); bam_export str reverse_qual_bat(bat * ret, bat * bid); bam_export str seq_length_bat(bat * ret, bat * bid); +bam_export str seq_char_bat(bat * ret, int * ref_pos, bat * alg_seq, bat * alg_pos, bat * alg_cigar); #endif _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list