Last active
December 2, 2020 12:24
-
-
Save sminot/e5b13dfbada7883430cfac8d8696222d to your computer and use it in GitHub Desktop.
Fetch gene sequences from KEGG
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import requests | |
from random import shuffle | |
# Argument parsing code courtesy of @wasade | |
import click | |
# Function to fetch the amino acid sequence for a single gene | |
def _fetch_gene(s, gene_name): | |
r = s.get(f"http://rest.kegg.jp/get/{gene_name}/aaseq") | |
assert r.status_code == 200, f"Could not retrieve gene sequence for {gene_name}" | |
return r.text | |
@click.command() | |
@click.option('--ko', type=str, required=True, | |
help='Input the gene ID ("KXXXXX") as the only argument to the ' | |
'function') | |
def fetch(ko): | |
assert ko.startswith("K"), "Must specify KO to search." | |
s = requests.Session() | |
# Fetch the list of genes linked to this accession | |
r = s.get(f"http://rest.kegg.jp/link/genes/{ko}") | |
# Make sure that we were able to fetch the list of linked genes | |
assert r.status_code == 200, f"Could not retrieve genes linked to {ko}" | |
# Store sequences in a dict | |
seqs = {} | |
# Parse the list of gene names from the data which was returned | |
gene_name_list = [ | |
l.rsplit("\t", 1)[1] | |
for l in r.text.splitlines() if "\t" in l | |
] | |
click.echo(f"Preparing to download {len(gene_name_list):,} sequences linked to {ko}") | |
# Randomly subset to 100 sequences. Change this as you like. | |
if len(gene_name_list) > 100: | |
click.echo("Subsetting to a random selection of 100") | |
shuffle(gene_name_list) | |
gene_name_list = gene_name_list[:100] | |
# Fetch each of the individual genes | |
for gene_name in gene_name_list: | |
seqs[gene_name] = _fetch_gene(s, gene_name) | |
if len(seqs) % 10 == 0: | |
click.echo(f"Fetched {len(seqs):,} gene sequences") | |
# Write out the results -- change the output file path as you like | |
with open(f"{ko}.fasta", "w") as handle: | |
handle.write("\n".join(seqs.values())) | |
# Reporting | |
click.echo(f"Wrote out {len(seqs):,} sequences to {ko}.fasta") | |
if __name__ == '__main__': | |
fetch() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Nice! A minor revision using
click
andrequest.Session
is below. It looks like you can't issue PRs against gists unfortunately...