alyssafrazee · March 27, 2013 15:50
diff --git a/countReadlets.py b/countReadlets.py
 #!/usr/bin/python

 ### readlet counting script

 ### helper functions:
 def enum(collection,st):
    #just like "enumerate" but you define your own starting position.
    #this returns indices RELATIVE TO ORIGINAL LIST
    i = st
    while i < len(collection):
        yield (i,collection[i])
        i += 1

 def getfirstindex(L,st,value,K):
    for pos,t in enum(L,st):
        if t[1] > value-K:
            return pos #returns first read in the list that CAN contain given bp
    return 0

 def getlastindex(L,st,value):
    for pos,t in enum(L,st):
        if t[1] > value:
            return pos #returns first read in the list that CANNOT contain given bp
    return len(L)

 # previous 2 helper functions were inspired by code here: http://stackoverflow.com/questions/946860/using-pythons-list-index-method-on-a-list-of-tuples-or-objects (see answer labeled "10", superperformant)


 ### the main function:
 def countReadlets(fname,outfname,k,chromosome):
    import pysam
    samfile = pysam.Samfile(fname,"rb") # parse the .bam (sequence alignment) file
    id_start_end = []
    maxpos = 0
    minpos = 3000000000
    for read in samfile.fetch(chromosome):
        id_start_end.append([read.qname, read.pos+1, read.aend]) #id_start_end will be a list of readlet alignments, containing read ID (readlets coming from same read have same ID), alignment start position, and alignment end position
        if read.aend > maxpos:
            maxpos = read.aend
        if read.pos+1 < minpos:
            minpos = read.pos+1
    f = open(outfname, 'w')
    first = 0
    last = 0
    for z in xrange(1,minpos):
        f.write("%s\t%s\n" % (z,0))
    for i in xrange(minpos, maxpos+1):
        last = getlastindex(id_start_end,first,i)
        first = getfirstindex(id_start_end,first,i,k) #k = 25 for 25-character readlets
        if first == last:
            f.write("%s\t%s\n" % (i,0))
            continue
        overlaps = id_start_end[first:last]
        readnames = set()
        for j in xrange(len(overlaps)):
            readnames.add(overlaps[j][0])
        numreads = len(readnames) #count readlets only once per read
        f.write("%s\t%s\n" % (i,numreads))
    f.close()
    return None

 ### get arguments from command line
 ### use: python countReadlets.py --file myfile.bam --output outfile.txt --kmer 100 --chrom 22
 from optparse import OptionParser
 opts = OptionParser()
 opts.add_option("--file","-f",type="string",help="input file name (must be .bam, and must be indexed with samtools first)")
 opts.add_option("--output","-o",type="string",help="output file name")
 opts.add_option("--kmer","-k",type="int",help="kmer length")
 opts.add_option("--chrom","-c",type="string",help="chromosome to parse")
 options,arguments = opts.parse_args()


 countReadlets(options.file,options.output,options.kmer,options.chrom)
	#!/usr/bin/python

	### readlet counting script

	### helper functions:
	def enum(collection,st):
	#just like "enumerate" but you define your own starting position.
	#this returns indices RELATIVE TO ORIGINAL LIST
	i = st
	while i < len(collection):
	yield (i,collection[i])
	i += 1

	def getfirstindex(L,st,value,K):
	for pos,t in enum(L,st):
	if t[1] > value-K:
	return pos #returns first read in the list that CAN contain given bp
	return 0

	def getlastindex(L,st,value):
	for pos,t in enum(L,st):
	if t[1] > value:
	return pos #returns first read in the list that CANNOT contain given bp
	return len(L)

	# previous 2 helper functions were inspired by code here: http://stackoverflow.com/questions/946860/using-pythons-list-index-method-on-a-list-of-tuples-or-objects (see answer labeled "10", superperformant)


	### the main function:
	def countReadlets(fname,outfname,k,chromosome):
	import pysam
	samfile = pysam.Samfile(fname,"rb") # parse the .bam (sequence alignment) file
	id_start_end = []
	maxpos = 0
	minpos = 3000000000
	for read in samfile.fetch(chromosome):
	id_start_end.append([read.qname, read.pos+1, read.aend]) #id_start_end will be a list of readlet alignments, containing read ID (readlets coming from same read have same ID), alignment start position, and alignment end position
	if read.aend > maxpos:
	maxpos = read.aend
	if read.pos+1 < minpos:
	minpos = read.pos+1
	f = open(outfname, 'w')
	first = 0
	last = 0
	for z in xrange(1,minpos):
	f.write("%s\t%s\n" % (z,0))
	for i in xrange(minpos, maxpos+1):
	last = getlastindex(id_start_end,first,i)
	first = getfirstindex(id_start_end,first,i,k) #k = 25 for 25-character readlets
	if first == last:
	f.write("%s\t%s\n" % (i,0))
	continue
	overlaps = id_start_end[first:last]
	readnames = set()
	for j in xrange(len(overlaps)):
	readnames.add(overlaps[j][0])
	numreads = len(readnames) #count readlets only once per read
	f.write("%s\t%s\n" % (i,numreads))
	f.close()
	return None

	### get arguments from command line
	### use: python countReadlets.py --file myfile.bam --output outfile.txt --kmer 100 --chrom 22
	from optparse import OptionParser
	opts = OptionParser()
	opts.add_option("--file","-f",type="string",help="input file name (must be .bam, and must be indexed with samtools first)")
	opts.add_option("--output","-o",type="string",help="output file name")
	opts.add_option("--kmer","-k",type="int",help="kmer length")
	opts.add_option("--chrom","-c",type="string",help="chromosome to parse")
	options,arguments = opts.parse_args()


	countReadlets(options.file,options.output,options.kmer,options.chrom)