Created
December 8, 2010 16:38
-
-
Save indapa/733516 to your computer and use it in GitHub Desktop.
determine intervals in file2.bed not covered by the file1.bed, given a length file of sources
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
from optparse import OptionParser | |
from bx.bitset import * | |
from bx.bitset_builders import * | |
def read_len( lengthfh ): | |
"""Read a 'LEN' file and return a mapping from chromosome to length""" | |
mapping = dict() | |
for line in lengthfh: | |
fields = line.split() | |
mapping[ fields[0] ] = int( fields[1] ) | |
return mapping | |
def main(): | |
""" determine intervals in file2.bed not covered by the file1.bed """ | |
usage = "usage: %prog --lenfile=LENGTHFILE file1.bed file2.bed\n\ndetermine intervals in file2.bed not covered by the file1.bed\n" | |
parser = OptionParser(usage) | |
parser.add_option("--lenfile", type="string", dest="lengthfile", help="file with source name and bp size, seperated by white space") | |
(options, args)=parser.parse_args() | |
try: | |
bed_first, bed_second = args | |
except: | |
print usage | |
exit(1) | |
try: | |
lenfh=open(options.lengthfile) | |
lens=read_len(lenfh) | |
except: | |
print usage | |
exit(1) | |
bitsets1 = binned_bitsets_from_file( open( bed_first ) ) #bitsets1 set to 1 for bp in the bed intervals for bedfile1 | |
bitsets2 = binned_bitsets_from_file( open( bed_second ) )#bitsets2 set to 1 for bp in the bed intervals for bedfile2 | |
bitsets = dict() | |
#invert the bitsets1, so that the bits set to 1 now indicate intervals that are *not* covered by bedfile1 | |
for chrom in lens: | |
if chrom in bitsets1: | |
bitsets1[chrom].invert() | |
len = lens[chrom] | |
end = 0 | |
while 1: | |
start = bitsets1[chrom].next_set( end ) | |
if start == bitsets1[chrom].size: break | |
end = bitsets1[chrom].next_clear( start ) | |
if end > len: end = len | |
if end == len: break | |
else: | |
pass | |
#which intervals in bedfile2 are overlapping with the intervals not covered by bedfile1 | |
for key in bitsets1: | |
if key in bitsets2: | |
bitsets1[key].iand( bitsets2[key] ) | |
bitsets[key] = bitsets1[key] | |
for chrom in bitsets: | |
bits = bitsets[chrom] | |
end = 0 | |
while 1: | |
start = bits.next_set( end ) | |
if start == bits.size: break | |
end = bits.next_clear( start ) | |
print "%s\t%d\t%d" % ( chrom, start, end ) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment