Last active
October 26, 2016 22:42
-
-
Save pansapiens/0546a5a185fdb604f0aaa68da0425610 to your computer and use it in GitHub Desktop.
Given a counts table from featureCounts (subread) and a GTF/GFF FeatureDB database, output a table with a gene name column ("symbol") containing only counts from "protein_coding" exon features.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sys | |
import os | |
import argparse | |
import pandas as pd | |
import gffutils | |
def create_featuredb(gtf_file, output_file=':memory:'): | |
""" | |
Create an (sqlite) gffutils FeatureDB from a GTF/GFF file. | |
:param gtf_file: A GTF of GFF format feature file. | |
:type gtf_file: basestring | |
:param output_file: An output file path for the FeatureDB, or ':memory:' to | |
create the database in memory only. | |
:type output_file: basestring | |
:return: A FeatureDB object | |
:rtype: gffutils.FeatureDB | |
""" | |
db = gffutils.create_db(gtf_file, output_file) | |
return db | |
def annotate_combined_count_file(count_file, featuredb_file, out_file=None, | |
feature_type='exon', | |
feature_source='protein_coding'): | |
""" | |
Using a FeatureDB (sqlite), adds a gene name ('symbol') column to a | |
featureCounts table (tsv) and by default outputs only rows for 'exon' | |
features that are of source 'protein_coding'. | |
:param count_file: | |
:type count_file: basestring | |
:param featuredb_file: | |
:type featuredb_file: basestring | |
:param out_file: | |
:type out_file: basestring | |
:param feature_type: | |
:type feature_type: basestring | |
:param feature_source: | |
:type feature_source: basestring | |
:return: The filename of the output file. None if outputting to stdout. | |
:rtype: basestring | |
""" | |
db = gffutils.FeatureDB(featuredb_file, keep_order=True) | |
# if the genes don't have a gene_id or gene_name set | |
# a KeyError will be raised | |
symbol_lookup = dict() | |
type_lookup = dict() | |
for f in db.features_of_type(feature_type): | |
if f.source == feature_source: | |
symbol_lookup[f['gene_id'][0]] = f['gene_name'][0] | |
type_lookup[f['gene_id'][0]] = f.source | |
df = pd.io.parsers.read_table(count_file, sep="\t", index_col=0, header=0) | |
df['symbol'] = df.apply(lambda x: symbol_lookup.get(x.name, ""), axis=1) | |
df['type'] = df.apply(lambda x: type_lookup.get(x.name, ""), axis=1) | |
df = df[df.type == feature_source] | |
if out_file is None: | |
df.to_csv(sys.stdout, sep="\t", index_label="id") | |
else: | |
df.to_csv(out_file, sep="\t", index_label="id") | |
return out_file | |
def parse_commandline(): | |
parser = argparse.ArgumentParser( | |
description='Given a counts table from featureCounts (subread) and a ' | |
'GTF/GFF FeatureDB database, output a table with a gene ' | |
'name column ("symbol") containing only counts from ' | |
'"protein_coding" exons features.') | |
parser.add_argument("-f", "--feature-type", | |
default='exon', | |
help="A feature type (GTF column 3), eg exon") | |
parser.add_argument("-s", "--feature-source", | |
default='protein_coding', | |
help="A feature 'source' (GTF column 2), " | |
"eg protein_coding") | |
parser.add_argument("-o", "--output", | |
default=None, | |
help="An output file for the filtered counts. Defaults" | |
"to stdout.") | |
parser.add_argument("-g", "--gxf-feature-file", | |
default=None, | |
help="If specified, first create the FeatureDB based" | |
"on this GTF / GFF file.") | |
parser.add_argument("count_file", help="A counts table, tab delimited.") | |
parser.add_argument("featuredb_file", | |
help="A gffutils FeatureDB, previously generated from " | |
"a GTF / GFF file. If the -g or " | |
"--gxf-feature-file option is specified, generate" | |
"this file.") | |
args = parser.parse_args() | |
return args | |
if __name__ == '__main__': | |
args = parse_commandline() | |
if args.gxf_feature_file is not None: | |
if os.path.exists(args.featuredb_file): | |
sys.stderr.write("%s would be overwritten. Terminating." % | |
args.featuredb_file) | |
sys.exit(1) | |
create_featuredb(args.gxf_feature_file, args.featuredb_file) | |
sys.stderr.write("Created %s\n" % args.featuredb_file) | |
outfn = annotate_combined_count_file(args.count_file, | |
args.featuredb_file, | |
out_file=args.output, | |
feature_type=args.feature_type, | |
feature_source=args.feature_source) | |
if outfn is not None: | |
sys.stderr.write("Created %s\n" % outfn) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment