Skip to content

Instantly share code, notes, and snippets.

@daler
Created August 14, 2014 12:04
Show Gist options
  • Save daler/99cdf714d438e161dad6 to your computer and use it in GitHub Desktop.
Save daler/99cdf714d438e161dad6 to your computer and use it in GitHub Desktop.
pybedtools issue #110
import pybedtools
import pandas
def split_coverage(x):
"""
Split a coverage file created using bedtools coverage -hist -- which will
have trailing "all" hist lines -- into 1) a BedTool object with valid BED
lines and 2) a pandas DataFrame of all coverage, parsed from the trailing
"all" lines.
`x` can be a filename or a BedTool instance.
"""
if isinstance(x, basestring):
fn = x
else:
fn = x.fn
f = open(fn)
hist_lines = []
def gen():
"""
Generator that yields only valid BED lines and then stops.
The first "all" line is appended to hist_lines.
"""
while True:
line = f.next()
toks = line.strip().split('\t')
if toks[0] == 'all':
# Don't need to include the "all" text in the first field.
hist_lines.append(toks[1:])
break
yield pybedtools.create_interval_from_list(toks)
# Create a BedTool from the generator, and call `saveas` to consume the
# generator. We need this so that the file pointer will stop right after
# the first "all" line.
b = pybedtools.BedTool(gen()).saveas()
# Pick up where we left off in the open file, appending to hist_lines.
while True:
try:
line = f.next()
except StopIteration:
break
hist_lines.append(line.strip().split('\t')[1:])
df = pandas.DataFrame(
hist_lines,
columns=['depth', 'count', 'size', 'percent'])
return b, df
# Example usage --------------------------------------------------------------
#
a = pybedtools.example_bedtool('x.bed')
regions = pybedtools.BedTool(
"""
chr2L 1000 10000
chr2L 12000 15000
chr2L 20000 30000
""", from_string=True)
coverage = a.coverage(regions, hist=True)
# Call it like this
new_bedtool, dataframe = split_coverage(coverage)
# ensure we can iterate through `b`. This is what triggered an error when
# using a non-split coverage result and the whole reason for needing
# a split_coverage function:
for _ in new_bedtool:
pass
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment