Last active
April 4, 2022 20:54
-
-
Save sminot/cebfbd84d57406b5b41b2eebffb1789f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import click | |
import os | |
import pandas as pd | |
# Set up the command line processor | |
@click.command() | |
@click.option("--details-hdf5", help="Provide the path to the geneshot output file containing all gene abundances (*.details.hdf5)") | |
@click.option("--gene-name", help="Name of the gene to extract", prompt="Gene Name") | |
def geneshot_extract_gene_abund(details_hdf5, gene_name): | |
assert details_hdf5 is not None, "Must provide path to the geneshot output file containing all gene abundances (*.details.hdf5)" | |
assert os.path.exists(details_hdf5), f"Cannot file file: {details_hdf5}" | |
assert gene_name is not None, "Must provide gene name to extract" | |
# Collect all of the outputs in one place | |
dat = [] | |
# Open the file | |
with pd.HDFStore(details_hdf5, "r") as store: | |
# Iterate over every table in the store | |
for k in store: | |
# If this is an abundance table | |
if k.startswith("/abund/gene/long/"): | |
# Get the specimen name | |
specimen = k[len("/abund/gene/long/"):] | |
# Add it to the data, filtering to the gene of interest | |
print(f"Reading {k}") | |
df = pd.read_hdf(store, k) | |
tot_reads = df['nreads'].sum() | |
dat.append(df.query(f"id == '{gene_name}'").assign(specimen=specimen, prop_reads=lambda d: d['nreads'] / tot_reads)) | |
# Combine all of the data | |
df = pd.concat(dat).reset_index(drop=True) | |
# Write it out as CSV | |
df.to_csv(f"{gene_name}.geneshot.csv", index=None) | |
if __name__ == "__main__": | |
geneshot_extract_gene_abund() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment