Skip to content

Instantly share code, notes, and snippets.

@nh13
Created March 10, 2012 05:12
Show Gist options
  • Save nh13/2010255 to your computer and use it in GitHub Desktop.
Save nh13/2010255 to your computer and use it in GitHub Desktop.
SAM File Re-organizer
#!/usr/bin/env python
############################################
# SAM File Re-organizer
###########################################
#
# What if you took a SAM file and transposed the data,
# that is, made it so that the rows become columns and vice versa, even keeping
# the optional tags? Would the file compress better?
#
# A BAM file stores "blocks" of data, and whenever it retrieves data, even for
# random retrieval, it must decompress a whole block. So it should be able to
# transpose the columns to rows and vice versa. This could save some disk
# space, I reckon.
# NB: samtools uses "bgzip" to compress the data.
#
# Results
#$ls -alh *sam *bam *gz *bz2 *txt
#-rw-r--r-- 1 user staff 5.1M Mar 10 00:00 in.bam
#-rw-r--r-- 1 user staff 37M Mar 10 00:00 in.sam
#-rw-r--r-- 1 user staff 4.8M Mar 10 00:07 in.sam.bgzip.gz
#-rw-r--r-- 1 user staff 3.8M Mar 10 00:08 in.sam.bzip2.bz2
#-rw-r--r-- 1 user staff 4.5M Mar 10 00:08 in.sam.gzip.gz
#-rw-r--r-- 1 user staff 34M Mar 10 00:01 out.txt
#-rw-r--r-- 1 user staff 3.8M Mar 10 00:02 out.txt.bgzip.gz
#-rw-r--r-- 1 user staff 3.8M Mar 10 00:03 out.txt.bzip2.bz2
#-rw-r--r-- 1 user staff 3.6M Mar 10 00:03 out.txt.gzip.gz
import sys
from optparse import OptionParser
class Data:
    """Column-oriented (transposed) store for tab-separated SAM records.

    Every record passed to ``add`` is split on tabs. The first ``tag_col``
    fields (the standard data) are stored under the integer keys
    ``0 .. tag_col - 1``. Each remaining field (an optional tag) is stored
    under its first four characters, i.e. tag plus type (ex. ``RG:Z``).
    All columns are kept the same length: a record that lacks an optional
    tag contributes ``empty_data`` to that tag's column.

    Attributes:
        data: dict mapping a column key (int or tag string) to the list of
            that column's values, one entry per record added.
        opt_tags: dict whose keys are the optional tags seen so far.
        tags: column keys in first-seen order; ``print_out`` emits the
            columns in this order.
        n: number of records added so far.
    """

    # Number of leading fields stored positionally as standard data.
    tag_col = 11
    # Placeholder stored when a record does not carry an optional tag.
    # TODO: is NA safe? A tag whose real value is "NA" is indistinguishable.
    empty_data = "NA"

    def __init__(self):
        self.data = {}
        self.opt_tags = {}
        self.tags = []
        self.n = 0
        for i in range(self.tag_col):
            self.data[i] = []
            self.tags.append(i)

    def add(self, line):
        """Append one tab-separated record to the columns.

        Args:
            line: a record line with its line terminator already removed.

        Raises:
            ValueError: if ``line`` has fewer than ``tag_col`` fields. The
                check runs before anything is stored, so a malformed line
                never leaves the columns with unequal lengths.
        """
        tokens = line.split('\t')
        if len(tokens) < self.tag_col:
            raise ValueError(
                "expected at least %d tab-separated fields, found %d"
                % (self.tag_col, len(tokens)))

        # standard data
        for i in range(self.tag_col):
            self.data[i].append(tokens[i])

        # optional tags
        local_data = {}
        for tok in tokens[self.tag_col:]:
            tag = tok[:4]  # include type (ex. RG:Z)
            local_data[tag] = tok[5:]
            if tag not in self.data:
                self.opt_tags[tag] = True
                self.tags.append(tag)
                # First sighting of this tag: back-fill a placeholder for
                # every record that was added before it appeared.
                self.data[tag] = [self.empty_data] * self.n

        # Every known optional column grows by exactly one entry per record.
        for tag in self.opt_tags:
            self.data[tag].append(local_data.get(tag, self.empty_data))
        self.n += 1

    def print_out(self):
        """Write the transposed table to stdout, one column per line.

        Each line is the column key followed by that column's values, all
        separated by tabs. The tab after the key keeps the key from fusing
        with the first value (otherwise key 1 + value "05" and key 10 +
        value "5" would both print as "105").
        """
        for tag in self.tags:
            column = self.data[tag]
            sys.stdout.write(str(tag))
            if column:
                sys.stdout.write("\t")
                # NB: this could take a lot of memory
                sys.stdout.write("\t".join(column))
            sys.stdout.write("\n")
def main(options):
    """Transpose the SAM file named by ``options.input`` onto stdout.

    Header lines (those starting with ``@``) are echoed unchanged as they
    are read. Every other non-blank line is collected in a ``Data`` object,
    which is printed one column per line after the whole file has been read.

    Args:
        options: parsed command-line options; only ``options.input`` (the
            path of the SAM file to read) is used.
    """
    data = Data()
    # The with-block closes the file even if a record fails to parse.
    with open(options.input, 'r') as fh:
        for line in fh:
            line = line.rstrip("\r\n")
            if not line:
                # Blank lines carry no record; indexing line[0] on one
                # would raise IndexError.
                continue
            if line.startswith("@"):
                sys.stdout.write(line + "\n")
            else:
                data.add(line)
    data.print_out()
def check_option(parser, value, name):
    """Exit with the usage help if a required command-line option is missing.

    Args:
        parser: the option parser; its ``print_help`` is called on failure.
        value: the parsed option value; ``None`` means it was not supplied.
        name: the option flag to show in the message (ex. ``-i``).

    Returns:
        None when ``value`` is present.

    Raises:
        SystemExit: with status 1 when ``value`` is None, after the message
            and the help text have been printed.
    """
    if value is None:
        # The second newline leaves a blank line before the help text.
        sys.stdout.write('Option ' + name + ' required.\n\n')
        parser.print_help()
        sys.exit(1)
if __name__ == '__main__':
    # Command-line entry point: parse -i/--input, refuse stray positional
    # arguments, make sure the input path was given, then run main().
    parser = OptionParser()
    parser.add_option(
        '-i', '--input',
        dest='input',
        default=None,
        help="Input SAM file")
    options, args = parser.parse_args()
    if args:
        # Positional arguments are not accepted.
        parser.print_help()
        sys.exit(1)
    check_option(parser, options.input, '-i')
    main(options)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment