Last active
March 6, 2018 11:08
-
-
Save philippmuench/0de2e9a9fc0b3cd9e684d393ae97e64f to your computer and use it in GitHub Desktop.
Shell script that downloads all proteins for the taxids listed in intersection_ids.txt.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
#
# Download the protein sequences of every taxon listed in intersection_ids.txt.
#
# Input:   intersection_ids.txt -- one record per line: "<taxon name>;<taxid>"
# Output:  out/<taxon_name>.fasta -- proteins of each taxon whose hit count is
#          below max_threshold (whitespace and "/" in the name become "_")
#          log.txt -- "<taxon_name>;<taxid>" for every taxon that was skipped
# Needs:   NCBI Entrez Direct (esearch, efetch) on PATH.

input_file=intersection_ids.txt
log_file=log.txt        # taxa that were skipped are recorded here
out_dir=out
max_threshold=20000     # taxa with this many proteins or more are skipped

for tool in esearch efetch; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    printf 'error: required tool not found: %s\n' "$tool" >&2
    exit 1
  fi
done

# Check the input before touching the old log, so a failed start keeps it.
if [ ! -r "$input_file" ]; then
  printf 'error: cannot read %s\n' "$input_file" >&2
  exit 1
fi

rm -f -- "$log_file"    # every run starts with a fresh skip log
mkdir -p -- "$out_dir" || exit 1

# "|| [ -n "$line" ]" also processes a last line without a trailing newline.
while IFS= read -r line || [ -n "$line" ]; do
  # Field 1: taxon name, trimmed and made safe to use as a file name.
  phyla_name=$(printf '%s\n' "$line" | awk -F';' '{
    name = $1
    gsub(/^[ \t\r]+|[ \t\r]+$/, "", name)
    gsub(/[ \t\r]+/, "_", name)
    gsub("/", "_", name)
    print name
  }')
  # Field 2: taxid; drop blanks/CR so "name; 123" and CRLF files still work.
  txid=$(printf '%s\n' "$line" | awk -F';' '{print $2}' | tr -d '[:space:]')

  # Silently ignore blank lines.
  if [ -z "$phyla_name" ] && [ -z "$txid" ]; then
    continue
  fi

  # Anything without a name or with a non-numeric taxid is not a valid record.
  case $txid in
    '' | *[!0-9]*) phyla_name= ;;
  esac
  if [ -z "$phyla_name" ]; then
    printf 'warning: ignoring malformed line: %s\n' "$line" >&2
    continue
  fi

  echo "processing $phyla_name"

  # Search once and reuse the result for both the count and the download.
  # stdin comes from /dev/null so esearch cannot swallow the rest of the
  # input file that the loop is reading.
  search_result=$(esearch -db protein -query "txid${txid}[Organism:exp]" \
    </dev/null)
  num_found=$(printf '%s\n' "$search_result" \
    | sed -n 's:.*<Count>\([0-9][0-9]*\)</Count>.*:\1:p' \
    | head -n 1)

  echo "checking"
  echo "$num_found"

  # A failed or unparsable search yields no count; skip and log the taxon
  # instead of feeding an empty string to the numeric comparison below.
  if [ -z "$num_found" ]; then
    printf 'could not determine protein count for %s (txid %s), skip\n' \
      "$phyla_name" "$txid" >&2
    printf '%s;%s\n' "$phyla_name" "$txid" >>"$log_file"
    continue
  fi

  if [ "$num_found" -lt "$max_threshold" ]; then
    echo "downloading..."
    out_file=$out_dir/$phyla_name.fasta
    if ! printf '%s\n' "$search_result" \
      | efetch -format fasta >"$out_file"; then
      printf 'warning: download failed for %s (txid %s)\n' \
        "$phyla_name" "$txid" >&2
      rm -f -- "$out_file"   # do not leave a truncated FASTA file behind
    fi
  else
    # Skip this taxon: there are too many proteins to download.
    echo "too many proteins found ($num_found), skip"
    printf '%s;%s\n' "$phyla_name" "$txid" >>"$log_file"
  fi
done <"$input_file"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment