Changeset: 84ebb8ecd572 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=84ebb8ecd572
Modified Files:
        sql/backends/monet5/bam/85_bam.sql
        sql/backends/monet5/bam/bam.mal
        sql/backends/monet5/bam/bam_lib.c
        sql/backends/monet5/bam/bam_lib.h
Branch: bamloader
Log Message:

Added UDF for calculating the character from an alignment that maps to some 
position in the reference string, given this position and the alignment pos, 
cigar and seq fields


diffs (182 lines):

diff --git a/sql/backends/monet5/bam/85_bam.sql 
b/sql/backends/monet5/bam/85_bam.sql
--- a/sql/backends/monet5/bam/85_bam.sql
+++ b/sql/backends/monet5/bam/85_bam.sql
@@ -25,6 +25,9 @@ RETURNS STRING EXTERNAL NAME bam.reverse
 CREATE FUNCTION bam.seq_length(cigar STRING)
 RETURNS INT EXTERNAL NAME bam.seq_length;
 
+CREATE FUNCTION bam.seq_char(ref_pos INT, alg_seq STRING, alg_pos INT, 
alg_cigar STRING)
+RETURNS CHAR(1) EXTERNAL NAME bam.seq_char;
+
 
 CREATE PROCEDURE bam.sam_export(output_path STRING)
 EXTERNAL NAME bam.sam_export;
diff --git a/sql/backends/monet5/bam/bam.mal b/sql/backends/monet5/bam/bam.mal
--- a/sql/backends/monet5/bam/bam.mal
+++ b/sql/backends/monet5/bam/bam.mal
@@ -38,6 +38,10 @@ command seq_length(cigar:str):int
 address seq_length
 comment "Calculate the real length of a DNA sequence, given its CIGAR string."
 
+command seq_char(ref_pos:int, alg_seq:str, alg_pos:int, alg_cigar:str):str
+address seq_char
+comment "Calculate the character in the alignment string (alg_str) that is 
aligned to position 'ref_pos', conforming to the given cigar string"
+
 
 # Export signatures
 
@@ -69,3 +73,7 @@ comment "Reverse a bat of DNA Quality st
 command seq_length(cigars:bat[:oid,:str]):bat[:oid,:int]
 address seq_length_bat
 comment "Calculate the real length of a bat of DNA sequences, given their 
CIGAR string."
+
+command seq_char(ref_pos:int, alg_seq:bat[:oid,:str], alg_pos:bat[:oid,:int], 
alg_cigar:bat[:oid,:str]):bat[:oid,:str]
+address seq_char
+comment "Calculate the character in the alignment string (alg_str) that is 
aligned to position 'ref_pos', conforming to the given cigar string (bat based 
version)"
diff --git a/sql/backends/monet5/bam/bam_lib.c 
b/sql/backends/monet5/bam/bam_lib.c
--- a/sql/backends/monet5/bam/bam_lib.c
+++ b/sql/backends/monet5/bam/bam_lib.c
@@ -141,7 +141,7 @@ reverse_qual(str * ret, str * qual)
 }
 
 str
-seq_length(int *ret, str * cigar)
+seq_length(int * ret, str * cigar)
 {
        int result = 0;
        str cigar_consumable = *cigar;
@@ -169,6 +169,52 @@ seq_length(int *ret, str * cigar)
        return MAL_SUCCEED;
 }
 
+str
+seq_char(str * ret, int * ref_pos, str * alg_seq, int * alg_pos, str * 
alg_cigar) 
+{
+       str cigar_consumable = *alg_cigar;
+       int seq_pos = 0;
+       bit at_character = false;
+       int iterate_until = *ref_pos - *alg_pos;
+       str result;
+
+       if((result = GDKmalloc(2 * sizeof(char))) == NULL) {
+               throw(MAL, "seq_char", MAL_MALLOC_FAIL);
+       }
+       result[1] = '\0';
+       
+       if (cigar_consumable[0] == '*' && cigar_consumable[1] == '\0') {
+               result[0] = '\0';
+               *ret = result;
+               return MAL_SUCCEED;
+       }
+       while(cigar_consumable[0] != '\0' && seq_pos < iterate_until) {
+               int cnt;
+               char op;
+               int nr_chars_read;
+
+               if (sscanf
+                       (cigar_consumable, "%d%c%n", &cnt, &op,
+                        &nr_chars_read) != 2)
+                       throw(MAL, "seq_char",
+                                 "Error parsing CIGAR string '%s'\n", 
*alg_cigar);
+               if (op == 'M' || op == 'D' || op == 'N' || op == '='
+                       || op == 'X') {
+                       seq_pos += cnt;
+                       if(seq_pos > iterate_until) 
+                               seq_pos = iterate_until;
+                       at_character = true;
+               } else {
+                       at_character = false;
+               }
+               cigar_consumable += nr_chars_read;
+       }
+       result[0] = at_character ? (*alg_seq)[seq_pos] : '\0';
+       *ret = result;
+       return MAL_SUCCEED;
+}
+
+
 
 
 str
@@ -335,3 +381,65 @@ seq_length_bat(bat * ret, bat * bid)
 
        return MAL_SUCCEED;
 }
+
+
+str
+seq_char_bat(bat * ret, int * ref_pos, bat * alg_seq, bat * alg_pos, bat * 
alg_cigar)
+{
+       BAT *seqs, *poss, *cigars, *result;
+       BUN seq = 0, pos = 0, cigar = 0, seq_end = 0;
+       BATiter seq_it, pos_it, cigar_it;
+
+       assert(ret != NULL && ref_pos != NULL && alg_seq != NULL && alg_pos != 
NULL && alg_cigar != NULL);
+
+       if ((seqs = BATdescriptor(*alg_seq)) == NULL ||
+           (poss = BATdescriptor(*alg_pos)) == NULL ||
+               (cigars = BATdescriptor(*alg_cigar)) == NULL) 
+               throw(MAL, "seq_char_bat", RUNTIME_OBJECT_MISSING);
+
+       if(BATcount(seqs) != BATcount(poss) || BATcount(seqs) != 
BATcount(cigars)) {
+               throw(MAL, "seq_char_bat", "Misalignment in input BATs");
+       }
+
+       /* allocate result BAT */
+       result = BATnew(TYPE_void, TYPE_str, BATcount(cigars), TRANSIENT);
+       if (result == NULL) {
+               throw(MAL, "seq_char_bat", MAL_MALLOC_FAIL);
+       }
+       BATseqbase(result, seqs->hseqbase);
+
+       seq = BUNfirst(seqs);
+       pos = BUNfirst(poss);
+       cigar = BUNfirst(cigars);
+       seq_end = BUNlast(seqs);
+
+       seq_it = bat_iterator(seqs);
+       pos_it = bat_iterator(poss);
+       cigar_it = bat_iterator(cigars);
+
+       while(seq < seq_end) {
+               str seq_val = (str) BUNtail(seq_it, seq);
+               int * pos_val = (int *) BUNtail(pos_it, pos);
+               str cigar_val = (str) BUNtail(cigar_it, cigar);
+               str r;
+               str msg;
+
+               if ((msg = seq_char(&r, ref_pos, &seq_val, pos_val, 
&cigar_val)) != MAL_SUCCEED) {
+                       BBPreleaseref(result->batCacheid);
+                       return msg;
+               }
+               BUNappend(result, (ptr) &r, FALSE);
+               ++seq;
+               ++pos;
+               ++cigar;
+       }
+
+       /* release input BAT-descriptors */
+       BBPreleaseref(seqs->batCacheid);
+       BBPreleaseref(poss->batCacheid);
+       BBPreleaseref(cigars->batCacheid);
+
+       BBPkeepref((*ret = result->batCacheid));
+
+       return MAL_SUCCEED;
+}
diff --git a/sql/backends/monet5/bam/bam_lib.h 
b/sql/backends/monet5/bam/bam_lib.h
--- a/sql/backends/monet5/bam/bam_lib.h
+++ b/sql/backends/monet5/bam/bam_lib.h
@@ -42,10 +42,12 @@ bam_export str bam_flag(bit * ret, sht *
 bam_export str reverse_seq(str * ret, str * seq);
 bam_export str reverse_qual(str * ret, str * qual);
 bam_export str seq_length(int *ret, str * cigar);
+bam_export str seq_char(str * ret, int * ref_pos, str * alg_seq, int * 
alg_pos, str * alg_cigar);
 
 bam_export str bam_flag_bat(bat * ret, bat * bid, str * name);
 bam_export str reverse_seq_bat(bat * ret, bat * bid);
 bam_export str reverse_qual_bat(bat * ret, bat * bid);
 bam_export str seq_length_bat(bat * ret, bat * bid);
+bam_export str seq_char_bat(bat * ret, int * ref_pos, bat * alg_seq, bat * 
alg_pos, bat * alg_cigar);
 
 #endif
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to