Hi, I was wondering I'm going about this in the correct way. I need to test if there are coding sequences or exons in hg19 which match a string of 100bp "D" i.e. [A,G or T]. However I'm getting a strange result.
I get a hit on chr7, using the 100bp search however when I search with 60bp sequence of "D" I don't get any hits. library("BSgenome") library("Biostrings") library("BSgenome.Hsapiens.UCSC.hg19") library("biomaRt") library("GenomicFeatures") #extract the alignments which match real genes #(add this onto stefans script req to buid the gr object using the regions from the BSgenome alignment of the 100bp seqs.) txdb <- makeTranscriptDbFromUCSC(genome = "hg19", tablename = "knownGene") ## do once locally & save #query.plus <- DNAString("DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD") # 100bp C free sequence query.plus <- DNAString("DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD") # 60bp C free sequence query.minus <- reverseComplement(query.plus) chrList <- c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", "chrY") #access/group the by subset of annotation annotGr <- exons(txdb) #annotGr <- cds(txdb) wholeGenomeMatch <- function() { for(i in chrList) { #matches on plus strand matches.plus.strand <- matchPattern(query.plus, Hsapiens[[i]], fixed=FALSE, max.mismatch=5) mp <- as.matrix(matches.plus.strand) #matches on minus strand matches.minus.strand <- matchPattern(query.minus, Hsapiens[[i]], fixed=FALSE, max.mismatch=5) mm <- as.matrix(matches.minus.strand) OL.p <- NULL if(nrow(mp) > 0) { ##### MATCH THE POSITIVE STRAND ###### #need to get start and end positions (end = start + (length-1)) gr <- GRanges( seqnames = rep(i,nrow(mp)), ranges = IRanges(start = mp[[1]], end = mp[[1]]+(mp[,2]-1)), strand = rep("+", nrow(mp))) #OL <- findOverlaps(query=gr, subject=annotGr) OL.p <- annotGr[(!is.na(match(annotGr, gr)))] #as.data.frame(OL) ## view the results #tdata.p <- annotGr[unique(queryHits(OL)),] #tdata.p <- as.data.frame(tdata.p, row.names = NULL, optional = FALSE) #write.table(tdata.p, paste(i,"plus.txt")) cat( paste(i,"plus.txt\n")) } OL.m <- NULL if(nrow(mm) > 0) { ##### MATCH THE MINUS STRAND ######## #need to get start and end positions (end = start + (length-1)) gr <- GRanges( seqnames = rep(i,nrow(mm)), ranges = IRanges(start = mm[[1]], end = mm[[1]]+(mm[,2]-1)), strand = rep("-", nrow(mm))) #OL <- findOverlaps(query=gr, subject=annotGr) OL.m <- annotGr[(!is.na(match(annotGr, gr)))] #tdata.m <- annotGr[unique(queryHits(OL)),] #tdata.m <- as.data.frame(tdata.m, row.names = NULL, optional = FALSE) #write.table(tdata.m, paste(i,"minus.txt")) cat( paste(i,"minus.txt\n")) #write.table(rbind(tdata.p, tdata.p), paste(i, ".txt")) } ##write all matching results into one file write.table(rbind(as.data.frame(OL.p), as.data.frame(OL.m)), file="60bp_v_exons_max-mismatch=5.txt", append=TRUE) } } wholeGenomeMatch() #in the first instance with the 100bp search I get 1 hit in the output file: "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "1" "chr7" 420815 422845 2031 "+" 97277 "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" #with the 60 bp query string my output file reads no hits "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" "seqnames" "start" "end" "width" "strand" "exon_id" Your help is much appreciated, Amos [[alternative HTML version deleted]]
______________________________________________ R-help@r-project.org mailing list https://stat.ethz.ch/mailman/listinfo/r-help PLEASE do read the posting guide http://www.R-project.org/posting-guide.html and provide commented, minimal, self-contained, reproducible code.