Skip to content

Instantly share code, notes, and snippets.

@nhoffman
Last active December 31, 2015 21:09
Show Gist options
  • Save nhoffman/8045302 to your computer and use it in GitHub Desktop.
Save nhoffman/8045302 to your computer and use it in GitHub Desktop.
Filter a file of genomic positions given positions specified in a bed file
#!/usr/bin/env python
"""Filter a file of genomic positions given ranges of start positions
specified in a bed file - totally untested.
"""
from collections import defaultdict
from operator import itemgetter
import csv
import sys
def coords(row, chromosome=0, start=1, stop=2):
    """Return ``(chromosome, start, stop)`` taken from one parsed row.

    The same column layout is assumed for both the bed file and the
    file being filtered, so one set of indices serves both.

    Args:
        row: Sequence of string fields, e.g. a row produced by
            ``csv.reader``.
        chromosome: Index of the column holding the chromosome name.
        start: Index of the column holding the start position.
        stop: Index of the column holding the end position.

    Returns:
        A 3-tuple: the chromosome field unchanged, followed by the start
        and stop fields converted with ``int``.

    Raises:
        IndexError: If ``row`` is too short for one of the indices.
        ValueError: If the start or stop field is not an integer literal.
    """
    return row[chromosome], int(row[start]), int(row[stop])
def _merge_intervals(intervals):
    """Return sorted, non-overlapping closed intervals covering ``intervals``.

    ``intervals`` is an iterable of ``(beg, end)`` integer pairs with both
    ends inclusive.  Overlapping or directly adjacent pairs are fused so a
    position can later be located with a single binary search.
    """
    merged = []
    for beg, end in sorted(intervals):
        if merged and beg <= merged[-1][1] + 1:
            # Overlaps (or touches) the previous interval: extend it.
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([beg, end])
    return merged


def main(argv=None):
    """Copy rows of a positions file whose start lies in a bed-file range.

    Three arguments are expected: the bed file, the file to filter and
    the output file.  Every row of the bed file contributes the closed
    range ``[start, end]`` for its chromosome; a row of the second file
    is written to the output when its start position falls inside any
    range recorded for the same chromosome.

    Args:
        argv: Optional list of arguments (without the program name).
            Defaults to ``sys.argv[1:]``.

    Raises:
        SystemExit: If fewer than three arguments are supplied.

    NOTE(review): all files are parsed with the default ``csv`` dialect
    (comma-delimited).  Standard BED files are usually tab-delimited --
    confirm the expected input format.
    """
    from bisect import bisect_right

    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 3:
        raise SystemExit('usage: BEDFILE GIANTFILE OUTFILE')
    bedfile, giantfile, outfile = args[:3]

    # Collect the ranges per chromosome as (beg, end) pairs instead of
    # expanding them into sets of single positions: memory then scales
    # with the number of ranges, not with the number of bases covered.
    pairs_by_chrom = defaultdict(list)
    with open(bedfile, newline='') as f:
        for row in csv.reader(f):
            if not row:
                continue  # csv.reader yields [] for blank lines
            chrom, beg, end = coords(row)
            if beg <= end:  # range(beg, end + 1) would be empty otherwise
                pairs_by_chrom[chrom].append((beg, end))

    # Parallel sorted lists of interval starts/ends for binary search.
    starts, ends = {}, {}
    for chrom, pairs in pairs_by_chrom.items():
        merged = _merge_intervals(pairs)
        starts[chrom] = [beg for beg, _ in merged]
        ends[chrom] = [end for _, end in merged]

    # Now we can filter: is the start position among the allowed
    # positions for this chromosome?
    with open(giantfile, newline='') as g, \
            open(outfile, 'w', newline='') as o:
        writer = csv.writer(o)
        for row in csv.reader(g):
            if not row:
                continue
            chrom, beg, _ = coords(row)
            chrom_starts = starts.get(chrom)
            if not chrom_starts:
                continue  # no ranges recorded for this chromosome
            # Rightmost interval whose start is <= beg, if any.
            idx = bisect_right(chrom_starts, beg) - 1
            if idx >= 0 and beg <= ends[chrom][idx]:
                writer.writerow(row)
# Run the filter only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment