Last active
December 31, 2015 21:09
-
-
Save nhoffman/8045302 to your computer and use it in GitHub Desktop.
Filter a file of genomic positions given positions specified in a bed file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""Filter a file of genomic positions given ranges of start positions
specified in a bed file - totally untested.
"""
import csv
import sys
from bisect import bisect_right
from collections import defaultdict
from operator import itemgetter
def coords(row, chromosome=0, start=1, stop=2):
    """Return ``(chromosome, start, end)`` extracted from one parsed row.

    The same column indices are assumed to apply to rows of both the
    bed file and the file being filtered.

    Args:
        row: Sequence of column values (strings) for a single line.
        chromosome: Index of the chromosome column.
        start: Index of the start-position column.
        stop: Index of the end-position column.

    Returns:
        A tuple of the chromosome value as found in the row, and the
        start and end positions converted to ``int``.

    Raises:
        IndexError: If the row has fewer columns than an index requires.
        ValueError: If a position column cannot be parsed as an integer.
    """
    return row[chromosome], int(row[start]), int(row[stop])
def _load_ranges(bedfile):
    """Read ``bedfile`` and return ``{chromosome: (starts, ends)}``.

    ``starts`` and ``ends`` are parallel, ascending lists describing
    merged, non-overlapping intervals that are inclusive at both ends.
    Storing interval bounds instead of every covered position keeps
    memory proportional to the number of bed rows, not bases covered.
    """
    raw = defaultdict(list)
    # NOTE(review): csv.reader's default dialect is comma-delimited, while
    # BED files are conventionally tab-delimited -- confirm the input format.
    with open(bedfile, newline='') as f:
        for row in csv.reader(f):
            if not row:
                # csv.reader yields [] for blank lines; nothing to record
                continue
            chrom, beg, end = coords(row)
            if beg <= end:
                # an inverted range covers no positions at all
                raw[chrom].append((beg, end))

    merged = {}
    for chrom, intervals in raw.items():
        intervals.sort()
        starts, ends = [], []
        for beg, end in intervals:
            # overlapping or directly adjacent integer ranges collapse
            # into a single interval
            if starts and beg <= ends[-1] + 1:
                if end > ends[-1]:
                    ends[-1] = end
            else:
                starts.append(beg)
                ends.append(end)
        merged[chrom] = (starts, ends)
    return merged


def _contains(intervals, position):
    """Return True if ``position`` lies within one of the merged intervals.

    ``intervals`` is a ``(starts, ends)`` pair as built by ``_load_ranges``,
    or None when the chromosome has no ranges.
    """
    if intervals is None:
        return False
    starts, ends = intervals
    # rightmost interval whose start is <= position, if any
    idx = bisect_right(starts, position) - 1
    return idx >= 0 and position <= ends[idx]


def main():
    """Filter rows of a positions file by the ranges given in a bed file.

    Command-line arguments (read from ``sys.argv``):
        bedfile: file whose rows give chromosome, start, end; each row
            allows every position from start through end inclusive.
        giantfile: file of rows to filter, using the same column layout.
        outfile: destination for the rows of ``giantfile`` whose start
            position is allowed for their chromosome.

    Blank lines in either input are skipped. Exits with a usage message
    when fewer than three arguments are supplied.
    """
    if len(sys.argv) < 4:
        sys.exit('usage: {} bedfile giantfile outfile'.format(sys.argv[0]))
    bedfile, giantfile, outfile = sys.argv[1:4]

    # prepare a dictionary of chromosome: merged allowed intervals
    ranges = _load_ranges(bedfile)

    # now we can filter; newline='' lets the csv module handle line endings
    with open(giantfile, newline='') as g, \
            open(outfile, 'w', newline='') as o:
        writer = csv.writer(o)
        for row in csv.reader(g):
            if not row:
                continue
            # is the start position among the allowed positions for this
            # chromosome?
            chrom, beg, _ = coords(row)
            # .get avoids creating an entry for every unseen chromosome
            if _contains(ranges.get(chrom), beg):
                writer.writerow(row)
# Run the filter only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment