We get the rRNA and tRNA sequences from the non-redundant contigs using the Prokka gff and ffn files.
for i in *gff; do
DNAM=../../contigs-derep/ancientGut/${i/.gff/_derep_contigs.tsv};
DNAM=${DNAM/.proteins./.assm.};
seqkit grep -r -n -f <(awk -F'\t' '$3 == "rRNA" || $3 == "tRNA" {print $1, $9}' <(grep -f ${DNAM} $i) \
| sed -e 's/ID//;' \
| awk -F'[=;]' '{print $2}') ${i/.gff/.ffn} -o ${i/.gff/.rna.fa.gz};
done
Prepare the genomic xRNA database. After getting the gbff
files from NCBI, we extract the rRNA and tRNA sequences.
for i in *gbff.gz; do
python get-rna-seqs.py -i ${i} -o ${i/genomic.gbff.gz/rna.fa.gz};
done
Then we remove the identical sequences in each genome:
for i in *_rna.fa.gz; do
NAM=$(basename $i _rna); vsearch --fastx_uniques $i --fastaout ${NAM}_derep.fa --uc ${NAM}_derep.uc --strand both;
cat *_rna.fa.gz_derep.fa > gut_simulation_rna.fa
done
Then we search the rRNA and tRNA sequences in the genomic xRNA database:
for i in *rna.fa.gz; do
vsearch --usearch_global ${i} --db ../../genome-rna/ancientGut/gut_simulation_rna.fa --id 0.9 --userout ${i/.fa.gz/.search.tsv} --userfields query+target+id+alnlen+mism+opens+ql+tl+qcov+tcov --maxaccepts 100 --maxrejects 100 --maxhits 1 --strand both --output_no_hits --threads 16 --query_cov 0.9;
done