philippmuench · March 6, 2018 11:08
diff --git a/get_proteins.sh b/get_proteins.sh
 #! /bin/sh
 #
 # Download protein FASTA files for a list of taxa via the Entrez Direct
 # tools esearch and efetch.
 #
 # Usage:  get_proteins.sh [input_file]
 # Input:  input_file (default: intersection_ids.txt), one "name;taxid"
 #         record per line.
 # Output: out/<name>.fasta for every taxon with fewer than max_threshold
 #         proteins; "name;taxid" is appended to log.txt for each skipped taxon.
 # Needs:  esearch and efetch on PATH.

 input_file=${1:-intersection_ids.txt}
 max_threshold=20000

 # Print a file-name stem for the taxon name in $1: leading whitespace is
 # dropped and every remaining run of whitespace becomes one underscore.
 sanitize_name() {
   printf '%s\n' "$1" \
     | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]][[:space:]]*/_/g'
 }

 # Read esearch output on stdin and print the first run of digits found on
 # the first line mentioning "Count"; print nothing when there is none.
 extract_count() {
   awk '/Count/ && match($0, /[0-9]+/) {
     print substr($0, RSTART, RLENGTH)
     exit
   }'
 }

 rm -f log.txt # file where the taxid of skipped taxa will be written to
 mkdir -p out

 # "|| [ -n ... ]" keeps a final record that lacks a trailing newline.
 while IFS=';' read -r raw_name txid _ || [ -n "$raw_name" ]; do
   # Drop blanks and a possible carriage return so they cannot leak into
   # the query string.
   txid=$(printf '%s' "$txid" | tr -d '[:space:]')

   # Blank or malformed lines carry nothing to query.
   if [ -z "$raw_name" ] || [ -z "$txid" ]; then
     continue
   fi

   taxon_name=$(sanitize_name "$raw_name")
   echo "processing $taxon_name"

   # stdin comes from /dev/null so esearch cannot consume the loop's input.
   num_found=$(esearch -db protein -query "txid${txid}[Organism:exp]" \
     < /dev/null | extract_count)
   echo "checking"
   echo "$num_found"

   # Without a numeric count the comparison below would fail with a shell
   # error; report the real reason and record the taxon as skipped.
   case $num_found in
     '' | *[!0-9]*)
       echo "could not determine protein count for $taxon_name (taxid $txid), skip" >&2
       echo "$taxon_name;$txid" >> log.txt
       continue
       ;;
   esac

   if [ "$num_found" -lt "$max_threshold" ]; then # skip this taxon if there are too many proteins
     echo "downloading..."
     if ! esearch -db protein -query "txid${txid}[Organism:exp]" < /dev/null \
       | efetch -format fasta > "out/$taxon_name.fasta"; then
       echo "download failed for $taxon_name (taxid $txid)" >&2
     fi
   else
     echo "too many proteins found ($num_found), skip"
     echo "$taxon_name;$txid" >> log.txt
   fi
 done < "$input_file"
	#! /bin/sh
	#
	# Download protein FASTA files for a list of taxa via the Entrez Direct
	# tools esearch and efetch.
	#
	# Usage:  get_proteins.sh [input_file]
	# Input:  input_file (default: intersection_ids.txt), one "name;taxid"
	#         record per line.
	# Output: out/<name>.fasta for every taxon with fewer than max_threshold
	#         proteins; "name;taxid" is appended to log.txt for each skipped taxon.
	# Needs:  esearch and efetch on PATH.

	input_file=${1:-intersection_ids.txt}
	max_threshold=20000

	# Print a file-name stem for the taxon name in $1: leading whitespace is
	# dropped and every remaining run of whitespace becomes one underscore.
	sanitize_name() {
	  printf '%s\n' "$1" \
	    | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]][[:space:]]*/_/g'
	}

	# Read esearch output on stdin and print the first run of digits found on
	# the first line mentioning "Count"; print nothing when there is none.
	extract_count() {
	  awk '/Count/ && match($0, /[0-9]+/) {
	    print substr($0, RSTART, RLENGTH)
	    exit
	  }'
	}

	rm -f log.txt # file where the taxid of skipped taxa will be written to
	mkdir -p out

	# "|| [ -n ... ]" keeps a final record that lacks a trailing newline.
	while IFS=';' read -r raw_name txid _ || [ -n "$raw_name" ]; do
	  # Drop blanks and a possible carriage return so they cannot leak into
	  # the query string.
	  txid=$(printf '%s' "$txid" | tr -d '[:space:]')

	  # Blank or malformed lines carry nothing to query.
	  if [ -z "$raw_name" ] || [ -z "$txid" ]; then
	    continue
	  fi

	  taxon_name=$(sanitize_name "$raw_name")
	  echo "processing $taxon_name"

	  # stdin comes from /dev/null so esearch cannot consume the loop's input.
	  num_found=$(esearch -db protein -query "txid${txid}[Organism:exp]" \
	    < /dev/null | extract_count)
	  echo "checking"
	  echo "$num_found"

	  # Without a numeric count the comparison below would fail with a shell
	  # error; report the real reason and record the taxon as skipped.
	  case $num_found in
	    '' | *[!0-9]*)
	      echo "could not determine protein count for $taxon_name (taxid $txid), skip" >&2
	      echo "$taxon_name;$txid" >> log.txt
	      continue
	      ;;
	  esac

	  if [ "$num_found" -lt "$max_threshold" ]; then # skip this taxon if there are too many proteins
	    echo "downloading..."
	    if ! esearch -db protein -query "txid${txid}[Organism:exp]" < /dev/null \
	      | efetch -format fasta > "out/$taxon_name.fasta"; then
	      echo "download failed for $taxon_name (taxid $txid)" >&2
	    fi
	  else
	    echo "too many proteins found ($num_found), skip"
	    echo "$taxon_name;$txid" >> log.txt
	  fi
	done < "$input_file"