Created
August 14, 2014 12:04
-
-
Save daler/99cdf714d438e161dad6 to your computer and use it in GitHub Desktop.
pybedtools issue #110
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pybedtools | |
import pandas | |
def split_coverage(x): | |
""" | |
Split a coverage file created using bedtools coverage -hist -- which will | |
have trailing "all" hist lines -- into 1) a BedTool object with valid BED | |
lines and 2) a pandas DataFrame of all coverage, parsed from the trailing | |
"all" lines. | |
`x` can be a filename or a BedTool instance. | |
""" | |
if isinstance(x, basestring): | |
fn = x | |
else: | |
fn = x.fn | |
f = open(fn) | |
hist_lines = [] | |
def gen(): | |
""" | |
Generator that yields only valid BED lines and then stops. | |
The first "all" line is appended to hist_lines. | |
""" | |
while True: | |
line = f.next() | |
toks = line.strip().split('\t') | |
if toks[0] == 'all': | |
# Don't need to include the "all" text in the first field. | |
hist_lines.append(toks[1:]) | |
break | |
yield pybedtools.create_interval_from_list(toks) | |
# Create a BedTool from the generator, and call `saveas` to consume the | |
# generator. We need this so that the file pointer will stop right after | |
# the first "all" line. | |
b = pybedtools.BedTool(gen()).saveas() | |
# Pick up where we left off in the open file, appending to hist_lines. | |
while True: | |
try: | |
line = f.next() | |
except StopIteration: | |
break | |
hist_lines.append(line.strip().split('\t')[1:]) | |
df = pandas.DataFrame( | |
hist_lines, | |
columns=['depth', 'count', 'size', 'percent']) | |
return b, df | |
# Example usage -------------------------------------------------------------- | |
# | |
a = pybedtools.example_bedtool('x.bed') | |
regions = pybedtools.BedTool( | |
""" | |
chr2L 1000 10000 | |
chr2L 12000 15000 | |
chr2L 20000 30000 | |
""", from_string=True) | |
coverage = a.coverage(regions, hist=True) | |
# Call it like this | |
new_bedtool, dataframe = split_coverage(coverage) | |
# ensure we can iterate through `b`. This is what triggered an error when | |
# using a non-split coverage result and the whole reason for needing | |
# a split_coverage function: | |
for _ in new_bedtool: | |
pass |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment