Created
June 12, 2015 14:28
-
-
Save tbrittoborges/ef0065197fbabf337f94 to your computer and use it in GitHub Desktop.
ensembl + thiago code for fetching variants.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import requests | |
import sys | |
import utils | |
__author__ = 'tbrittoborges' | |
""" | |
Created on 14:45 04/11/2014 2014 | |
Given a list of UniProt accession id, return all missense variants. | |
All variants means: | |
1. UniProt disease causing variants; | |
1. Ensembl somatic missense variants; | |
1. Ensembl germ missense variants; | |
""" | |
import time | |
import pandas as pd | |
def get_uniprot_column(sp_accession, column=None): | |
""" | |
:param sp_accession: | |
:return: | |
""" | |
assert isinstance(sp_accession, str) | |
params = {'format': 'tab', 'query': 'accession:' + sp_accession} | |
if column: | |
params['columns'] = column | |
url = 'http://www.uniprot.org/uniprot/' | |
request = requests.get(url.format(sp_accession), | |
params=params) | |
if request.ok: | |
return request.content.splitlines()[1] | |
def get_organism(sp_accession): | |
""" | |
:param sp_accession: | |
:return: | |
""" | |
url = 'http://www.uniprot.org/uniprot/?query=id:{0}' | |
request = requests.get(url.format(sp_accession), | |
params={'format': 'tab', 'columns': 'organism'}) | |
content = request.content | |
content = content[content.find("(") + 1: content.find(")")] | |
return content.lower() | |
def remove_accession_isoform(accession): | |
""" | |
:param accession: | |
:return: | |
""" | |
try: | |
i = accession.index('-') | |
return accession[:i] | |
except ValueError: | |
return accession | |
class EnsemblRestClient(object): | |
def __init__(self, server='http://rest.ensembl.org/', reqs_per_sec=15): | |
self.server = server | |
self.reqs_per_sec = reqs_per_sec | |
self.req_count = 0 | |
self.last_req = 0 | |
def perform_rest_action(self, endpoint, hdrs=None, params=None): | |
if hdrs is None: | |
hdrs = {} | |
if 'Content-Type' not in hdrs: | |
hdrs['Content-Type'] = 'application/json' | |
if not params: | |
params = {} | |
elif not isinstance(params, dict): | |
raise (TypeError, "params needs to be a dict") | |
data = None | |
# check if we need to rate limit ourselves | |
if self.req_count >= self.reqs_per_sec: | |
delta = time.time() - self.last_req | |
if delta < 1: | |
time.sleep(1 - delta) | |
self.last_req = time.time() | |
self.req_count = 0 | |
err = 'Request failed for {0}: Status code: {1.code} ' \ | |
'Reason: {1.reason}\n' | |
data = [] | |
try: | |
r = requests.get(self.server + endpoint, headers=hdrs, | |
params=params) | |
if r.ok: | |
data = r.json() | |
self.req_count += 1 | |
else: | |
print '{} - Invalid request code: {}'.format( | |
time.time(), r.status_code) | |
# TODO: log | |
except requests.HTTPError, e: | |
# check if we are being rate limited by the server | |
if e.code == 429: | |
if 'Retry-After' in e.headers: | |
retry = e.headers['Retry-After'] | |
time.sleep(float(retry)) | |
self.perform_rest_action(endpoint, hdrs, params) | |
elif e.code in [400, 403, 404, 408]: | |
print requests.HTTPError(err.format(endpoint, e)) | |
else: | |
print requests.HTTPError(err.format(endpoint, e)) | |
pass | |
except requests.ConnectionError as e: | |
time.sleep(30) | |
self.perform_rest_action(endpoint, hdrs, params) | |
return data | |
def get_variant(self, sp_accession, features=None): | |
if not features: | |
features = ['transcript_variation', 'somatic_transcript_variation'] | |
species = get_organism(sp_accession) | |
symbols = self.perform_rest_action( | |
'xrefs/symbol/{0}/{1}'.format(species, sp_accession), | |
params={'object_type': 'translation'}) | |
data = [] | |
for symbol in symbols: | |
for feature in features: | |
data += self.perform_rest_action( | |
'overlap/translation/{0}'.format(symbol['id']), | |
params={'type': 'missense_variant', | |
'feature': feature}) | |
for d in data: | |
d['sp_accession'] = sp_accession | |
return data | |
def get_variants(self, accessions): | |
data = [] | |
for accession in accessions: | |
try: | |
data.extend(self.get_variant(accession)) | |
except TypeError: | |
print "Error with {}".format(accession) | |
return pd.DataFrame(data) | |
def human_variation_fetcher(): | |
df = pd.read_table("http://www.uniprot.org/docs/humsavar.txt", skiprows=30, | |
delim_whitespace=True, | |
names=['gene_name', 'sp_ac', 'ftid', 'aa_change', | |
'variant_type', 'dbsnp', 'disease_name']) | |
df = df.ix[:69977] | |
df['previous_aa'], df['mutated_aa'], df['position'] = zip( | |
*df.aa_change.map(process_positions)) | |
return df | |
def process_positions(aa_change_str): | |
return aa_change_str[2:5], aa_change_str[-3:], int(aa_change_str[5:-3]) | |
def process_amino_acids_changes(aa_str): | |
return aa_str[0], aa_str[-1] | |
def is_mapped(row): | |
fasta_path = 'NOBACK/oglcnac_data/oglcnac.fasta' | |
sequences = utils.fasta_parser(utils.valid_dir(fasta_path)) | |
# if sequences[row['sp_ac']] = | |
pass | |
if __name__ == "__main__": | |
# with open(sys.argv[1]) as f: | |
# sp_accessions = f.readlines() | |
# sp_accessions = [acc.replace('\n', '') for acc in sp_accessions] | |
# sp_accessions = [remove_accession_isoform(acc) for acc in sp_accessions] | |
# client = EnsemblRestClient() | |
# data = client.get_variants(sp_accessions) | |
# ens_variant = pd.DataFrame(data) | |
# ens_variant.to_csv('NOBACK/oglcnac_data/missense_variants_from_oglcnac.csv') | |
# | |
# import pandas as pd | |
# import numpy as np | |
# import variant_fetch | |
# import utils | |
# oglcnac_df = pd.read_csv("../NOBACK/oglcnac_data/oglcnac.csv", usecols=['pid', 'mod_pos']) | |
# disease = variant_fetch.human_variation_fetcher() | |
# disease = disease.query("variant_type == 'Disease'") | |
# disease = disease[disease.sp_ac.isin(oglcnac_df.pid.unique())] | |
# disease['previous_aa'], disease['mutated_aa'], disease['position'] = zip( | |
# *disease.aa_change.map(variant_fetch.process_positions)) | |
# three_to_one = dict(zip([x.title() for x in utils.three_letter_aa], | |
# utils.one_letter_aa)) | |
# disease.previous_aa = disease.previous_aa.apply(three_to_one.get) | |
# disease.mutated_aa = disease.mutated_aa.apply(three_to_one.get) | |
# disease['type'] = np.where( | |
# disease.dbsnp.str.startswith('COSM'), 'cancer', 'disease') | |
# disease = disease[ | |
# ['sp_ac', 'previous_aa', 'mutated_aa', 'position', 'dbsnp', | |
# 'ftid', 'type']].set_index(['dbsnp', 'ftid']) | |
# neutral_somatic = pd.read_csv( | |
# '../NOBACK/oglcnac_data/missense_variants_from_oglcnac.csv') | |
# neutral_somatic = neutral_somatic[neutral_somatic.sp_accession.isin( | |
# oglcnac_df.pid.unique())] | |
# | |
# neutral_somatic['previous_aa'], neutral_somatic['mutated_aa'] = zip( | |
# *neutral_somatic.residues.map(variant_fetch.process_amino_acids_changes)) | |
# neutral_somatic.rename(columns={'id': 'dbsnp', 'start': 'position', | |
# 'sp_accession': 'sp_ac'}, inplace=True) | |
# neutral_somatic['ftid'] = '-' | |
# neutral_somatic['type'] = np.where( | |
# neutral_somatic.dbsnp.str.startswith('COSM'), 'cancer', 'neutral') | |
# neutral_somatic['previous_aa'], neutral_somatic['mutated_aa'] = zip( | |
# *neutral_somatic.residues.map(variant_fetch.process_amino_acids_changes)) | |
# neutral_somatic = neutral_somatic[['sp_ac', 'previous_aa', 'mutated_aa', | |
# 'position', 'dbsnp', 'type', 'ftid']].set_index(['dbsnp', 'ftid']) | |
# X = pd.concat([disease.reset_index(), neutral_somatic.reset_index()], axis=0) | |
# X.sort(['dbsnp', 'ftid'], ascending=(1, 0), inplace=True) | |
# X.drop_duplicates(['dbsnp', 'ftid'], inplace=True, take_last=True) | |
# X.set_index(['dbsnp', 'ftid'], inplace=True) | |
# | |
# result = np.array([False] * len(X)) | |
# for index, sp_id, position in oglcnac_df.itertuples(): | |
# result = result | ((X.sp_ac == sp_id) & ((X.position < position - 7) | ( | |
# X.position > position + 7))) | |
# | |
# | |
# | |
client = EnsemblRestClient() | |
disease = human_variation_fetcher() | |
data = client.get_variants(disease.sp_ac.unique()) | |
data.to_csv('human_variants.csv', index=False) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment