Last active
August 29, 2015 13:57
-
-
Save sashaphanes/9621316 to your computer and use it in GitHub Desktop.
takes a list of protein accession numbers with SNP information from STDIN, trims the SNPs, and generates a .fasta containing amino acid sequences for each accession
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use LWP::Simple; | |
# pipe STDIN to a loop that | |
# a) removes the strings after each accession number | |
# b) puts the remaining accession numbers from each line into an array | |
# $IN = \*STDIN; | |
my $line; | |
my @list; | |
foreach $line (<STDIN>){ | |
chomp($line); | |
$line =~ s/[ \t]+.*$//; | |
@list = split("\n", $line); | |
} | |
#append [accn] field to each accession | |
for (my $index=0; $index < @list; $index++) { | |
$list[$index] .= "[accn]"; | |
} | |
#join the accessions with OR | |
my $query = join('+OR+',@list); | |
#assemble the esearch URL | |
my $base = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'; | |
my $url = $base . "esearch.fcgi?db=nucleotide&term=$query&usehistory=y"; | |
#post the esearch URL | |
my $output = get($url); | |
#parse WebEnv and QueryKey | |
my $web = $1 if ($output =~ /<WebEnv>(\S+)<\/WebEnv>/); | |
my $key = $1 if ($output =~ /<QueryKey>(\d+)<\/QueryKey>/); | |
#assemble the efetch URL | |
$url = $base . "efetch.fcgi?db=protein&query_key=$key&WebEnv=$web"; | |
$url .= "&rettype=fasta&retmode=text"; | |
#post the efetch URL | |
my $fasta = get($url); | |
print "$fasta"; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment