Create needed folders
mkdir ft_files gb_files gb_files_filt gb_files_slice gb_files_ptt gb_files_geno
Get the assembly file from NCBI
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt
An example of the sample file used to get the NCBI data (genome_narrow.tsv)
WP_011376775.1 32231714 0.958 60 2 0 1 60 1 60 5.74e-36 123 60 60 GCF_000012645.1_ASM1264v1 hypothetical protein [Prochlorococcus marinus] hypo Bacteria Cyanobacteria Prochloraceae NA Prochlorococcus Prochlorococcus_marinus 340128
WP_032512946.1 32231714 0.905 60 6 0 1 60 1 60 2.35e-33 116 60 60 GCF_000759855.1_ASM75985v1 hypothetical protein [Prochlorococcus marinus] hypo Bacteria Cyanobacteria Prochloraceae NA Prochlorococcus Prochlorococcus_marinus 340128
Get the feature table file from NCBI
grep -w -f <(cut -f15 genome_narrow.tsv) assembly_summary.txt | cut -f 20 | while read line; do wget -l1 -r --no-parent -A "*feature_table.txt.gz" $line; done
find ftp.ncbi.nlm.nih.gov/ -name '*feature_table.txt.gz' -exec cp {} ft_files/ \;
rm -rf ftp.ncbi.nlm.nih.gov/
Get the genbank format file from NCBI
grep -w -f <(cut -f15 genome_narrow.tsv) assembly_summary.txt | cut -f 20 | while read line; do wget -l1 -r --no-parent -A "*genomic.gbff.gz" $line; done
find ftp.ncbi.nlm.nih.gov/ -name '*gbff.gz' -exec cp {} gb_files/ \;
rm -rf ftp.ncbi.nlm.nih.gov/
Get the location of our genes of interest and calculate coordinates 10kb upstream/downstream
S=10000; grep -f <(cut -f1 genome_narrow.tsv) ft_files/* | tr ':' $'\t' | cut -f 2- -d '/' | sed -e 's/_feature_table.txt//' | awk -vO=$S -vFS="\t" '{print $1,$9,$10,$11,$9-O,$10+O,$13,$17,$18,$15}' | awk -vO=$S '$2 >= O' > hits_10kb_up.txt
Split incomplete genomes in a set of contig files and keep the ones containing our gene of interest
cat hits_10kb_up.txt | while read LINE; do mawk -vV=$(echo $LINE | cut -f1 -d ' ') -v n=1 '/^\/\//{close(V".partial."n);n++;next} {print > V".partial."n}' gb_files/$(echo $LINE | cut -f1 -d ' ')_genomic.gbff; done
grep -f <(cut -f7 -d ' ' hits_10kb_up.txt) *partial* | cut -f 1 -d ':' | sort -u | while read file; do mv $file ${file/.partial.*/.gbk}; done
mv *gbk gb_files_filt
rm *partial*
Slice the genbank files 10kb upstream/downstream and convert them to ptt format
cat hits_10kb_up.txt | while read LINE; do echo '//' >> gb_files_filt/$(echo $LINE | cut -f1 -d ' ').gbk; python slicer.py -g gb_files_filt/$(echo $LINE | cut -f1 -d ' ').gbk -s $(echo $LINE | cut -f5 -d ' ') -e $(echo $LINE | cut -f6 -d ' ') -o $(echo $LINE | cut -f1 -d ' ')_slice.gbk; done
for i in *_slice.gbk; do perl gb2ptt.pl --infile $i; done
mv *slice.gbk gb_files_slice
mv *gbk.ptt gb_files_ptt
Create files for plotting the gene structures
for i in gb_files_ptt/*ptt; do NAM=$(basename $i _slice.gbk.ptt); awk 'NR>3' $i | sed -e 's/\.\./\t/' | awk -vFS="\t" -vOFS="\t" '{gsub(" ","_",$10); if($3 == "+"){S=1}else{S=-1}; print $10,$1,$2,S,"gray\t1\t1\t8\t1\tarrows"}' > ${NAM}_genoplot.tsv ; done
cat hits_10kb_up.txt | while read line; do F=$(echo $line | cut -f1 -d ' '); E=$(echo $line | cut -f7 -d ' '); awk -vC=$E '{if ($1 == C){$6="red"; $2="Genomic_unknown"}; print $0}' ${F}_genoplot.tsv | tr ' ' $'\t' | cut -f2- > tmp && mv tmp ${F}_genoplot.tsv; done
mv *_genoplot.tsv gb_files_geno
Plot with geneplot.r (Code modified from here)
Calculate synteny of the sliced files using mauve (conda install progressiveMauve
)
progressiveMauve --output=test gb_files_slice/*gbk