Skip to content

Instantly share code, notes, and snippets.

@tbrittoborges
Created June 12, 2015 14:28
Show Gist options
  • Save tbrittoborges/ef0065197fbabf337f94 to your computer and use it in GitHub Desktop.
Save tbrittoborges/ef0065197fbabf337f94 to your computer and use it in GitHub Desktop.
ensembl + thiago code for fetching variants.
#!/usr/bin/env python
import requests
import sys
import utils
__author__ = 'tbrittoborges'
"""
Created on 14:45 04/11/2014 2014
Given a list of UniProt accession id, return all missense variants.
All variants means:
1. UniProt disease causing variants;
1. Ensembl somatic missense variants;
1. Ensembl germ missense variants;
"""
import time
import pandas as pd
def get_uniprot_column(sp_accession, column=None):
"""
:param sp_accession:
:return:
"""
assert isinstance(sp_accession, str)
params = {'format': 'tab', 'query': 'accession:' + sp_accession}
if column:
params['columns'] = column
url = 'http://www.uniprot.org/uniprot/'
request = requests.get(url.format(sp_accession),
params=params)
if request.ok:
return request.content.splitlines()[1]
def get_organism(sp_accession):
"""
:param sp_accession:
:return:
"""
url = 'http://www.uniprot.org/uniprot/?query=id:{0}'
request = requests.get(url.format(sp_accession),
params={'format': 'tab', 'columns': 'organism'})
content = request.content
content = content[content.find("(") + 1: content.find(")")]
return content.lower()
def remove_accession_isoform(accession):
"""
:param accession:
:return:
"""
try:
i = accession.index('-')
return accession[:i]
except ValueError:
return accession
class EnsemblRestClient(object):
def __init__(self, server='http://rest.ensembl.org/', reqs_per_sec=15):
self.server = server
self.reqs_per_sec = reqs_per_sec
self.req_count = 0
self.last_req = 0
def perform_rest_action(self, endpoint, hdrs=None, params=None):
if hdrs is None:
hdrs = {}
if 'Content-Type' not in hdrs:
hdrs['Content-Type'] = 'application/json'
if not params:
params = {}
elif not isinstance(params, dict):
raise (TypeError, "params needs to be a dict")
data = None
# check if we need to rate limit ourselves
if self.req_count >= self.reqs_per_sec:
delta = time.time() - self.last_req
if delta < 1:
time.sleep(1 - delta)
self.last_req = time.time()
self.req_count = 0
err = 'Request failed for {0}: Status code: {1.code} ' \
'Reason: {1.reason}\n'
data = []
try:
r = requests.get(self.server + endpoint, headers=hdrs,
params=params)
if r.ok:
data = r.json()
self.req_count += 1
else:
print '{} - Invalid request code: {}'.format(
time.time(), r.status_code)
# TODO: log
except requests.HTTPError, e:
# check if we are being rate limited by the server
if e.code == 429:
if 'Retry-After' in e.headers:
retry = e.headers['Retry-After']
time.sleep(float(retry))
self.perform_rest_action(endpoint, hdrs, params)
elif e.code in [400, 403, 404, 408]:
print requests.HTTPError(err.format(endpoint, e))
else:
print requests.HTTPError(err.format(endpoint, e))
pass
except requests.ConnectionError as e:
time.sleep(30)
self.perform_rest_action(endpoint, hdrs, params)
return data
def get_variant(self, sp_accession, features=None):
if not features:
features = ['transcript_variation', 'somatic_transcript_variation']
species = get_organism(sp_accession)
symbols = self.perform_rest_action(
'xrefs/symbol/{0}/{1}'.format(species, sp_accession),
params={'object_type': 'translation'})
data = []
for symbol in symbols:
for feature in features:
data += self.perform_rest_action(
'overlap/translation/{0}'.format(symbol['id']),
params={'type': 'missense_variant',
'feature': feature})
for d in data:
d['sp_accession'] = sp_accession
return data
def get_variants(self, accessions):
data = []
for accession in accessions:
try:
data.extend(self.get_variant(accession))
except TypeError:
print "Error with {}".format(accession)
return pd.DataFrame(data)
def human_variation_fetcher():
df = pd.read_table("http://www.uniprot.org/docs/humsavar.txt", skiprows=30,
delim_whitespace=True,
names=['gene_name', 'sp_ac', 'ftid', 'aa_change',
'variant_type', 'dbsnp', 'disease_name'])
df = df.ix[:69977]
df['previous_aa'], df['mutated_aa'], df['position'] = zip(
*df.aa_change.map(process_positions))
return df
def process_positions(aa_change_str):
return aa_change_str[2:5], aa_change_str[-3:], int(aa_change_str[5:-3])
def process_amino_acids_changes(aa_str):
return aa_str[0], aa_str[-1]
def is_mapped(row):
fasta_path = 'NOBACK/oglcnac_data/oglcnac.fasta'
sequences = utils.fasta_parser(utils.valid_dir(fasta_path))
# if sequences[row['sp_ac']] =
pass
if __name__ == "__main__":
# with open(sys.argv[1]) as f:
# sp_accessions = f.readlines()
# sp_accessions = [acc.replace('\n', '') for acc in sp_accessions]
# sp_accessions = [remove_accession_isoform(acc) for acc in sp_accessions]
# client = EnsemblRestClient()
# data = client.get_variants(sp_accessions)
# ens_variant = pd.DataFrame(data)
# ens_variant.to_csv('NOBACK/oglcnac_data/missense_variants_from_oglcnac.csv')
#
# import pandas as pd
# import numpy as np
# import variant_fetch
# import utils
# oglcnac_df = pd.read_csv("../NOBACK/oglcnac_data/oglcnac.csv", usecols=['pid', 'mod_pos'])
# disease = variant_fetch.human_variation_fetcher()
# disease = disease.query("variant_type == 'Disease'")
# disease = disease[disease.sp_ac.isin(oglcnac_df.pid.unique())]
# disease['previous_aa'], disease['mutated_aa'], disease['position'] = zip(
# *disease.aa_change.map(variant_fetch.process_positions))
# three_to_one = dict(zip([x.title() for x in utils.three_letter_aa],
# utils.one_letter_aa))
# disease.previous_aa = disease.previous_aa.apply(three_to_one.get)
# disease.mutated_aa = disease.mutated_aa.apply(three_to_one.get)
# disease['type'] = np.where(
# disease.dbsnp.str.startswith('COSM'), 'cancer', 'disease')
# disease = disease[
# ['sp_ac', 'previous_aa', 'mutated_aa', 'position', 'dbsnp',
# 'ftid', 'type']].set_index(['dbsnp', 'ftid'])
# neutral_somatic = pd.read_csv(
# '../NOBACK/oglcnac_data/missense_variants_from_oglcnac.csv')
# neutral_somatic = neutral_somatic[neutral_somatic.sp_accession.isin(
# oglcnac_df.pid.unique())]
#
# neutral_somatic['previous_aa'], neutral_somatic['mutated_aa'] = zip(
# *neutral_somatic.residues.map(variant_fetch.process_amino_acids_changes))
# neutral_somatic.rename(columns={'id': 'dbsnp', 'start': 'position',
# 'sp_accession': 'sp_ac'}, inplace=True)
# neutral_somatic['ftid'] = '-'
# neutral_somatic['type'] = np.where(
# neutral_somatic.dbsnp.str.startswith('COSM'), 'cancer', 'neutral')
# neutral_somatic['previous_aa'], neutral_somatic['mutated_aa'] = zip(
# *neutral_somatic.residues.map(variant_fetch.process_amino_acids_changes))
# neutral_somatic = neutral_somatic[['sp_ac', 'previous_aa', 'mutated_aa',
# 'position', 'dbsnp', 'type', 'ftid']].set_index(['dbsnp', 'ftid'])
# X = pd.concat([disease.reset_index(), neutral_somatic.reset_index()], axis=0)
# X.sort(['dbsnp', 'ftid'], ascending=(1, 0), inplace=True)
# X.drop_duplicates(['dbsnp', 'ftid'], inplace=True, take_last=True)
# X.set_index(['dbsnp', 'ftid'], inplace=True)
#
# result = np.array([False] * len(X))
# for index, sp_id, position in oglcnac_df.itertuples():
# result = result | ((X.sp_ac == sp_id) & ((X.position < position - 7) | (
# X.position > position + 7)))
#
#
#
client = EnsemblRestClient()
disease = human_variation_fetcher()
data = client.get_variants(disease.sp_ac.unique())
data.to_csv('human_variants.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment