Created
March 10, 2012 05:12
-
-
Save nh13/2010255 to your computer and use it in GitHub Desktop.
SAM File Re-organizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
############################################
# SAM File Re-organizer
############################################
#
# What if you took a SAM file and transposed the data, that is, made the rows
# become columns and vice versa, while still keeping the optional tags? Would
# the file compress better?
#
# A BAM file stores data in "blocks", and any read, even a random-access one,
# must decompress a whole block first, so a reader should be able to transpose
# the columns back to rows (and vice versa) once a block is decompressed.
# This could save some disk space, I reckon.
# NB: samtools uses "bgzip" to compress the data.
#
# Results
#$ls -alh *sam *bam *gz *bz2 *txt
#-rw-r--r-- 1 user staff 5.1M Mar 10 00:00 in.bam
#-rw-r--r-- 1 user staff 37M Mar 10 00:00 in.sam
#-rw-r--r-- 1 user staff 4.8M Mar 10 00:07 in.sam.bgzip.gz
#-rw-r--r-- 1 user staff 3.8M Mar 10 00:08 in.sam.bzip2.bz2
#-rw-r--r-- 1 user staff 4.5M Mar 10 00:08 in.sam.gzip.gz
#-rw-r--r-- 1 user staff 34M Mar 10 00:01 out.txt
#-rw-r--r-- 1 user staff 3.8M Mar 10 00:02 out.txt.bgzip.gz
#-rw-r--r-- 1 user staff 3.8M Mar 10 00:03 out.txt.bzip2.bz2
#-rw-r--r-- 1 user staff 3.6M Mar 10 00:03 out.txt.gzip.gz
import sys | |
from optparse import OptionParser | |
class Data(object):
    """Column-wise accumulator for tab-separated SAM alignment records.

    Each record passed to add() is split on tabs. The first ``tag_col``
    fields are stored in per-column lists keyed by their integer index, and
    every optional tag is stored in a list keyed by its four-character
    ``TAG:TYPE`` prefix (for example ``RG:Z``). Records that lack a tag get
    ``empty_data`` in that tag's list, so all lists keep the same length.

    The code is written to run unchanged on both Python 2 and Python 3.
    """

    # Number of leading standard (non-tag) fields read from every record.
    tag_col = 11
    # Placeholder stored where a record lacks an optional tag.
    empty_data = "NA"  # TODO: is NA safe?

    def __init__(self):
        # Column store: int index (standard field) or tag string -> values.
        self.data = {}
        # Optional tags seen so far (dict used as a set; values are True).
        self.opt_tags = {}
        # Row labels in output order: standard indices, then tags as first seen.
        self.tags = []
        # Number of records added so far.
        self.n = 0
        for i in range(self.tag_col):
            self.data[i] = []
            self.tags.append(i)

    def add(self, line):
        """Append one tab-separated alignment record to the columns.

        Args:
            line: A record line without its trailing newline.

        Raises:
            IndexError: If the line has fewer than ``tag_col`` fields. The
                check runs before anything is stored, so a rejected line
                leaves the accumulated columns untouched.
        """
        tokens = line.split('\t')
        if len(tokens) < self.tag_col:
            raise IndexError(
                "expected at least %d tab-separated fields, found %d"
                % (self.tag_col, len(tokens)))

        # standard data
        for i in range(self.tag_col):
            self.data[i].append(tokens[i])

        # optional tags
        local_data = {}
        for tok in tokens[self.tag_col:]:
            tag = tok[:4]  # include type (ex. RG:Z)
            local_data[tag] = tok[5:]
            if tag not in self.data:
                self.opt_tags[tag] = True
                self.tags.append(tag)
                # Back-fill the records added before this tag first appeared.
                self.data[tag] = [self.empty_data] * self.n

        # Every known tag gets exactly one value per record.
        for tag in self.opt_tags:
            self.data[tag].append(local_data.get(tag, self.empty_data))
        self.n += 1

    def print_out(self):
        """Write the transposed table to stdout, one line per column/tag.

        Each line is the row label (column index or ``TAG:TYPE``) followed by
        that column's values, all separated by tabs. The tab after the label
        keeps it distinguishable from the first value.
        """
        write = sys.stdout.write
        for tag in self.tags:
            values = self.data[tag]
            write(str(tag))
            if values:
                write("\t")
                # NB: this could take a lot of memory
                write("\t".join(values))
            write("\n")
def main(options):
    """Transpose the SAM file named by ``options.input`` onto stdout.

    Header lines (those starting with "@") are echoed unchanged as they are
    read. Every other non-blank line is collected in a Data instance, which
    is printed once the whole file has been consumed.

    Args:
        options: Object with an ``input`` attribute holding the SAM file path.
    """
    data = Data()
    # The with-block closes the file even if a record fails to parse.
    with open(options.input, 'r') as fh:
        for line in fh:
            line = line.rstrip("\r\n")
            if not line:
                # Blank lines carry no record; indexing line[0] would raise.
                continue
            if line.startswith("@"):
                # Single parenthesised argument: valid on Python 2 and 3.
                print(line)
            else:
                data.add(line)
    data.print_out()
def check_option(parser, value, name):
    """Exit with usage help when a required command-line option is missing.

    Args:
        parser: Option parser; its print_help() is called on failure.
        value: Parsed option value; None means the option was not supplied.
        name: Option name shown in the error message (e.g. '-i').

    Raises:
        SystemExit: With status 1 when ``value`` is None.
    """
    if value is None:
        # Single parenthesised argument: valid on both Python 2 and Python 3.
        print('Option ' + name + ' required.\n')
        parser.print_help()
        sys.exit(1)
if __name__ == '__main__':
    # Command-line entry point: parse the options, validate them, then run.
    parser = OptionParser()
    parser.add_option(
        '-i', '--input',
        dest='input',
        default=None,
        help="Input SAM file"
    )
    options, extra_args = parser.parse_args()

    # Positional arguments are not accepted; show usage and fail.
    if extra_args:
        parser.print_help()
        sys.exit(1)

    # The input path is mandatory.
    check_option(parser, options.input, '-i')
    main(options)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment