Last active
November 3, 2017 23:57
-
-
Save rvprasad/1dd4a5466d528829b8d01c037d95913b to your computer and use it in GitHub Desktop.
Maps each alignment (in BAM) to reference gene sequence data (in GFF) and its description (in Description file).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# python2.7 | |
# | |
# Before using the script, execute the following. | |
# PYTHONPATH=~/.pip easy_install --install-dir=~/.pip intervaltree | |
# PYTHONPATH=~/.pip easy_install --install-dir=~/.pip plac | |
# | |
# To run the script, use the following command | |
# PYTHONPATH=~/.pip python2.7 mapper-par.py <desc> <gff> <bam> <output> | |
# <# cores> | |
from __future__ import print_function | |
from datetime import datetime | |
import multiprocessing | |
import plac | |
import pybam # https://github.com/JohnLonginotto/pybam | |
from mapper import * | |
def worker(refs, alignments, results): | |
reference = refs.get() | |
refs.task_done() | |
while True: | |
alignment = alignments.get() | |
alignments.task_done() | |
output = process_alignment(reference, *alignment) | |
results.put(output) | |
def process_alignments_par(bam_filename, output_filename, reference, | |
num_of_cores): | |
print('Reading alignment file ' + bam_filename) | |
refs = multiprocessing.JoinableQueue() | |
alignments = multiprocessing.JoinableQueue() | |
results = multiprocessing.JoinableQueue() | |
for _ in range(1, num_of_cores): | |
proc = multiprocessing.Process(target=worker, | |
args=(refs, alignments, results)) | |
proc.start() | |
refs.put(reference) | |
refs.close() | |
with open(output_filename, 'wt') as f: | |
f.write('QName StartPos EndPos RName ID Parent StartPos \ | |
EndPos Name Description\n'.replace(' ', '\t')) | |
lines = 0 | |
limit = num_of_cores * 400 | |
print("{0} lines processed {1}".format(lines, datetime.now())) | |
for sam_qname, sam_rname, sam_pos1, sam_seq in \ | |
pybam.read(bam_filename, ['sam_qname', 'sam_rname', | |
'sam_pos1', 'sam_seq']): | |
alignments.put([sam_qname, sam_rname, sam_pos1, len(sam_seq)]) | |
lines += 1 | |
if (lines % limit) == 0: | |
for _ in range(0, limit): | |
output = results.get() | |
results.task_done() | |
f.writelines(output) | |
print("processed {0} lines {1}".format(lines, datetime.now())) | |
alignments.close() | |
for _ in range(0, (lines % limit)): | |
output = results.get() | |
results.task_done() | |
f.writelines(output) | |
print("processed {0} lines {1}".format(lines, datetime.now())) | |
results.close() | |
def process(desc_filename, gff_filename, bam_filename, output_filename, | |
num_of_cores): | |
print('Reading description file ' + desc_filename) | |
id_to_desc = get_description(desc_filename) | |
reference = get_reference_tree(gff_filename, id_to_desc) | |
process_alignments_par(bam_filename, output_filename, reference, | |
int(num_of_cores)) | |
if __name__ == "__main__": | |
plac.call(process) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# python2.7 | |
# | |
# Before using the script, execute the following. | |
# PYTHONPATH=~/.pip easy_install --install-dir=~/.pip intervaltree | |
# PYTHONPATH=~/.pip easy_install --install-dir=~/.pip plac | |
# | |
# To run the script, use the following command | |
# PYTHONPATH=~/.pip python2.7 mapper-pool.py <desc> <gff> <bam> <output> | |
# <# cores> | |
from __future__ import print_function | |
from datetime import datetime | |
import multiprocessing | |
import plac | |
import pybam # https://github.com/JohnLonginotto/pybam | |
from mapper import * | |
reference = None | |
def initializer(refs): | |
global reference | |
reference = refs | |
def process_alignment_wrapper(inputs): | |
return process_alignment(reference, *inputs) | |
def process_alignments_par(bam_filename, output_filename, reference, | |
num_of_cores): | |
print('Reading alignment file ' + bam_filename) | |
pool = multiprocessing.Pool(num_of_cores, initializer, (reference,)) | |
with open(output_filename, 'wt') as f: | |
f.write('QName StartPos EndPos RName ID Parent StartPos \ | |
EndPos Name Description\n'.replace(' ', '\t')) | |
chunk_size = 400 | |
lines = 0 | |
alignments = [] | |
limit = num_of_cores * chunk_size | |
print("{0} lines processed {1}".format(lines, datetime.now())) | |
for sam_qname, sam_rname, sam_pos1, sam_seq in \ | |
pybam.read(bam_filename, ['sam_qname', 'sam_rname', | |
'sam_pos1', 'sam_seq']): | |
alignments.append([sam_qname, sam_rname, sam_pos1, len(sam_seq)]) | |
lines += 1 | |
if len(alignments) == limit: | |
output = [x for y in | |
pool.imap_unordered(process_alignment_wrapper, | |
alignments, chunk_size) for x in | |
y] | |
f.writelines(output) | |
print("processed {0} lines {1}".format(lines, datetime.now())) | |
alignments = [] | |
output = pool.imap_unordered(process_alignments_wrapper, alignments) | |
f.writelines(output) | |
print("processed {0} lines {1}".format(lines, datetime.now())) | |
pool.close() | |
def process(desc_filename, gff_filename, bam_filename, output_filename, | |
num_of_cores): | |
print('Reading description file ' + desc_filename) | |
id_to_desc = get_description(desc_filename) | |
reference = get_reference_tree(gff_filename, id_to_desc) | |
process_alignments_par(bam_filename, output_filename, reference, | |
int(num_of_cores)) | |
if __name__ == "__main__": | |
plac.call(process) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# python2.7 | |
# | |
# Before using the script, execute the following. | |
# PYTHONPATH=~/.pip easy_install --install-dir=~/.pip intervaltree | |
# PYTHONPATH=~/.pip easy_install --install-dir=~/.pip plac | |
# | |
# To run the script, use the following command | |
# PYTHONPATH=~/.pip python2.7 mapper-seq.py <desc> <gff> <bam> <output> | |
from __future__ import print_function | |
from datetime import datetime | |
import plac | |
import pybam # https://github.com/JohnLonginotto/pybam | |
from mapper import * | |
def process_alignments_seq(bam_filename, output_filename, reference): | |
print('Reading alignment file ' + bam_filename) | |
with open(output_filename, 'wt') as f: | |
f.write('QName StartPos EndPos RName ID Parent StartPos \ | |
EndPos Name Description\n'.replace(' ', '\t')) | |
lines = 0 | |
print("{0} lines processed {1}".format(lines, datetime.now())) | |
for sam_qname, sam_rname, sam_pos1, sam_seq in \ | |
pybam.read(bam_filename, ['sam_qname', 'sam_rname', | |
'sam_pos1', 'sam_seq']): | |
output = process_alignment(reference, sam_qname, sam_rname, | |
sam_pos1, len(sam_seq)) | |
f.writelines(output) | |
lines += 1 | |
if (lines % 1000) == 0: | |
print("{0} lines processed {1}".format(lines, datetime.now())) | |
def process(desc_filename, gff_filename, bam_filename, output_filename): | |
print('Reading description file ' + desc_filename) | |
id_to_desc = get_description(desc_filename) | |
reference = get_reference_tree(gff_filename, id_to_desc) | |
process_alignments_seq(bam_filename, output_filename, reference) | |
if __name__ == "__main__": | |
plac.call(process) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# python2.7 | |
from __future__ import print_function | |
import itertools | |
import intervaltree | |
import pybam # https://github.com/JohnLonginotto/pybam | |
def get_reference_tree(gff_filename, id_to_desc): | |
print('Reading reference file ' + gff_filename) | |
reference = intervaltree.IntervalTree() | |
i = 0 | |
with open(gff_filename, 'rt') as f: | |
for line in f: | |
if line[0] != '#': | |
tmp1 = line.split('\t') | |
tmp2 = {z[0]: z[1] for z in (map(lambda x: x.strip(), | |
x.split('=')) | |
for x in tmp1[8].split(';'))} | |
value = {x: tmp2[x] for x in set(['ID', 'Parent', 'Name']) | |
.intersection(tmp2.keys())} | |
value['StartPos'] = tmp1[3] | |
value['EndPos'] = tmp1[4] | |
if 'ID' in value: | |
tmp3 = value['ID'].split('.')[0] | |
if '-' not in tmp3: | |
tmp3 = tmp3 + '-RA' | |
value['Description'] = id_to_desc[tmp3] | |
tmp3 = (tmp1[0], '\t'.join(value.get(k, '') for k in | |
['ID', 'Parent', 'StartPos', | |
'EndPos', 'Name', 'Description'])) | |
reference[int(tmp1[3]):(float(tmp1[4]) + 0.000001)] = tmp3 | |
return reference | |
def get_description(desc_filename): | |
id_to_desc = {} | |
with open(desc_filename, 'rt') as f: | |
for line in f: | |
tmp1 = line.split('\t') | |
id_to_desc[tmp1[0]] = '\t'.join(map(lambda x: x.strip(), tmp1[1:])) | |
return id_to_desc | |
def process_alignment(reference, sam_qname, sam_rname, sam_pos1, sam_seq_len): | |
# get annotation for alignment.sam_pos1 | |
endPos = sam_pos1 + sam_seq_len - 1 | |
tmp1 = set(itertools.chain.from_iterable(reference[x] | |
for x in xrange(sam_pos1, | |
endPos + 1))) | |
tmp2 = list(x for x in tmp1 if x.data[0] == sam_rname) | |
tmp3 = (x.data[1] for x in sorted(tmp2, key=lambda x: x.end - x.begin)) | |
output = ['\t'.join(map(str, [sam_qname, sam_pos1, endPos, sam_rname, t])) | |
+ '\n' for t in tmp3] | |
return output |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Use either mapper-par.py, mapper-pool.py, or mapper-seq.py at the command line.