Created
March 27, 2013 15:50
-
-
Save alyssafrazee/5255296 to your computer and use it in GitHub Desktop.
This code was written to solve a genomics problem. Some background: the genome is basically a very long character string consisting the characters A, C, T, and G. In genomic studies, the data usually consists of short "sequencing reads" (~100 character substrings of the genome), which we map back to their place of origin in the genome (the origi…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
### readlet counting script | |
### helper functions: | |
def enum(collection,st): | |
#just like "enumerate" but you define your own starting position. | |
#this returns indices RELATIVE TO ORIGINAL LIST | |
i = st | |
while i < len(collection): | |
yield (i,collection[i]) | |
i += 1 | |
def getfirstindex(L,st,value,K): | |
for pos,t in enum(L,st): | |
if t[1] > value-K: | |
return pos #returns first read in the list that CAN contain given bp | |
return 0 | |
def getlastindex(L,st,value): | |
for pos,t in enum(L,st): | |
if t[1] > value: | |
return pos #returns first read in the list that CANNOT contain given bp | |
return len(L) | |
# previous 2 helper functions were inspired by code here: http://stackoverflow.com/questions/946860/using-pythons-list-index-method-on-a-list-of-tuples-or-objects (see answer labeled "10", superperformant) | |
### the main function: | |
def countReadlets(fname,outfname,k,chromosome): | |
import pysam | |
samfile = pysam.Samfile(fname,"rb") # parse the .bam (sequence alignment) file | |
id_start_end = [] | |
maxpos = 0 | |
minpos = 3000000000 | |
for read in samfile.fetch(chromosome): | |
id_start_end.append([read.qname, read.pos+1, read.aend]) #id_start_end will be a list of readlet alignments, containing read ID (readlets coming from same read have same ID), alignment start position, and alignment end position | |
if read.aend > maxpos: | |
maxpos = read.aend | |
if read.pos+1 < minpos: | |
minpos = read.pos+1 | |
f = open(outfname, 'w') | |
first = 0 | |
last = 0 | |
for z in xrange(1,minpos): | |
f.write("%s\t%s\n" % (z,0)) | |
for i in xrange(minpos, maxpos+1): | |
last = getlastindex(id_start_end,first,i) | |
first = getfirstindex(id_start_end,first,i,k) #k = 25 for 25-character readlets | |
if first == last: | |
f.write("%s\t%s\n" % (i,0)) | |
continue | |
overlaps = id_start_end[first:last] | |
readnames = set() | |
for j in xrange(len(overlaps)): | |
readnames.add(overlaps[j][0]) | |
numreads = len(readnames) #count readlets only once per read | |
f.write("%s\t%s\n" % (i,numreads)) | |
f.close() | |
return None | |
### get arguments from command line | |
### use: python countReadlets.py --file myfile.bam --output outfile.txt --kmer 100 --chrom 22 | |
from optparse import OptionParser | |
opts = OptionParser() | |
opts.add_option("--file","-f",type="string",help="input file name (must be .bam, and must be indexed with samtools first)") | |
opts.add_option("--output","-o",type="string",help="output file name") | |
opts.add_option("--kmer","-k",type="int",help="kmer length") | |
opts.add_option("--chrom","-c",type="string",help="chromosome to parse") | |
options,arguments = opts.parse_args() | |
countReadlets(options.file,options.output,options.kmer,options.chrom) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment