Skip to content

Instantly share code, notes, and snippets.

@muppetjones
Created July 27, 2015 21:05
Show Gist options
  • Save muppetjones/f0dce9a6cad6cc4b9fa1 to your computer and use it in GitHub Desktop.
Save muppetjones/f0dce9a6cad6cc4b9fa1 to your computer and use it in GitHub Desktop.
#!/bin/tcsh
#v3.91
########################################################################
# WHERE ARE ALL THE kSNP SCRIPTS?
# IF YOU INSTALLED kSNP ANYWHERE OTHER THAN /user/local THEN YOU MUST MODIFY THIS TO POINT TO THE DIRECTORY WHERE YOU HAVE INSTALLED kSNP SCRIPTS
set kSNP=/usr/local/kSNP3
#set kSNP=/g/g15/shea/kSNP3_Source
########################################################################
# Example: kSNP3 -in example.fasta.list -outdir Test.out -k 13 -annotate example.annotate_list -min_frac 0.7
if ($#argv == 0) then
printf "Usage: kSNP3 <options>\
Options are the following: \
-k <kmer_length> # required\n\
-in <input_fastaFile_list> # required file listing full path location of each genome and the genome name, one line per genome, tab delimited between full path to genome fasta file in column 1 and genome name in column 2. This format allows multi-read,multi-chromosome, and multi-contig genomes, each genome in separate fasta. If multiple chromosomes are listed as separate fasta entries in a single genome file, positions and annotations are found for each gi number \n\
-outdir <output_directory> # required \n\
-annotate <annotate_list> # optional file listing genome names for which to find positions and annotate SNPs, names match column 2 of the -in file. \n\
-SNPs_all <path to SNPs all file> # optional, if given then it uses existing SNPs instead of searching for new ones, and adds new genomes to the existing analysis. Assumes only the new genomes are listed in the -in file.\n\
-core # optional, if present calculate core SNPs and core SNP parsimony tree\n\
-ML # optional, if present calculate Maximum Likelihood tree\n\
-min_frac <minimum_fraction_genomes_with_locus> # optional to create a parsimony tree based only on SNP loci that occur in at least this fraction of genomes, for example -min_frac 0.5 \n\
-genbank <genbank.gbk> # optional file for SNP annotation\n\
-CPU <num_CPU> # optional, number of CPU's to use, if not specified it uses all available\n\
-NJ # optional, calculate a neighbor joining tree\n\
-vcf # optional, create a vcf file using the first genome specified in the -positions file as the reference genome\n\
-all_annotations # optional, annotate each locus exhaustively with all the annotations in any of the annotated genomes. Without this option it only provides the first annotation it comes to for a given locus, checking in the order genomes are listed in the -annotate file.\n"
exit
endif
set DEBUG=0
echo "Location of kSNP scripts: "
echo "$kSNP"
# Tell kSNP where the input files are
set thisDir = `pwd` # Directory with the input files
echo "The home directory is $thisDir"
# Read in parameters from command line
set annotate_list="nonexistent_file"
set genbankFile="nonexistent_file"
set all_annotations=0
while ($#argv > 0)
switch ($argv[1])
case -vcf:
set vcf=1
breaksw
case -NJ:
set nj=1
breaksw
case -ML:
set ML=1
breaksw
case -core:
set core=1
breaksw
case -k:
shift
set k=$argv[1]
breaksw
case -in:
shift
set fasta_list="$argv[1]"
breaksw
case -outdir:
shift
set dir="$argv[1]"
breaksw
case -annotate:
shift
set annotate_list="$argv[1]"
breaksw
case -all_annotations:
set all_annotations=1
breaksw
case -min_frac:
shift
set min_fraction_with_locus="$argv[1]"
breaksw
case -genbank:
shift
set genbankFile="$argv[1]"
breaksw
case -CPU:
shift
set num_cpus=$argv[1]
breaksw
case -SNPs_all:
shift
set SNPs_all="$argv[1]"
breaksw
default:
shift
printf "Unknown parameter $argv[1]\n"
endsw
shift
end
if ($?dir) then
set dir = `$kSNP/add_paths3 "$dir" "$thisDir"`
printf "dir $dir\n"
endif
if ($?fasta_list) then
set fasta_list = `$kSNP/add_paths3 "$fasta_list" "$thisDir"`
printf "fasta_list $fasta_list\n"
endif
if ($?annotate_list) then
set annotate_list = `$kSNP/add_paths3 "$annotate_list" "$thisDir"`
printf "annotate_list $annotate_list\n"
endif
if ($?genbankFile) then
set genbankFile = `$kSNP/add_paths3 "$genbankFile" "$thisDir"`
printf "genbankFile $genbankFile\n"
endif
if ($?SNPs_all) then
set SNPs_all = `$kSNP/add_paths3 "$SNPs_all" "$thisDir"`
printf "SNPs_all $SNPs_all\n"
endif
echo "Starting kSNP"
date
set startseconds=`date +%s`
echo "input fasta_list: $fasta_list"
echo "output directory: $dir"
echo "k=$k"
echo "annotate_list file: $annotate_list"
if ($all_annotations == 1) then
echo "Report all annotations."
else
echo "Report minimal annotations."
endif
if ( $?min_fraction_with_locus ) then
echo "min_fraction_with_locus: $min_fraction_with_locus"
endif
if ($?genbankFile ) then
if ( -e "$genbankFile") then
echo "Genbank file for annotations (and any from NCBI with gi number which are automatically downloaded): $genbankFile"
endif
endif
#get the number of CPUs
if !($?num_cpus) then
#get the operating system
set OS=`uname`
if ($OS == 'Darwin') then
echo "The operating system is $OS"
/usr/sbin/system_profiler SPHardwareDataType>wubba
set num_cpus=` awk '/Total Number of Cores/ {print $5}' wubba`
echo "There are $num_cpus CPUs"
rm wubba
##@ num_cpus=$num_proc
endif
if ($OS != 'Darwin') then
echo "The operating system is $OS"
set num_cpus=`cat /proc/cpuinfo | grep processor | wc -l`
endif
if ($num_cpus < 1 ) then
set num_cpus=8
echo "Could not figure out the number of CPUs, will run 8 processes"
endif
endif
echo "Number CPUs: $num_cpus"
#chesk the fasta genome files to be sure line endings are Unix and fix if they are not
cp -f "$fasta_list" fasta_list
$kSNP/LE2Unix fasta_list
# First check genome names. Prints to STDERR if duplicate names, STDOUT list of genome names parsed for kSNP. Use names corresponding to (none, any or all of) these in the $annotate_list file.
#echo "Sequence names used for kSNP:"
#$kSNP/genome_names3 "$fasta"
if !( -e "$dir") then
mkdir "$dir"
endif
cd "$dir"
if ( -e "$annotate_list") then
cp -f "$annotate_list" annotate_list
else
touch annotate_list
endif
cp -f "$fasta_list" fasta_list
#DOS to unix
perl -i -pe 's/\015\012/\012/g' annotate_list
perl -i -pe 's/\015/\012/g' annotate_list
perl -i -pe 's/\015\012/\012/g' fasta_list
perl -i -pe 's/\015/\012/g' fasta_list
echo "Finished genomes for finding SNP positions:"
cat annotate_list
echo ""
# Make lookup table of genome names and fsplit# files, and create fsplit# files by merging entries of multi-contig/multi-read input genomes.
set count=0
set num_seqs=`wc -l fasta_list | awk '{print $1}' `
echo "Number of input sequences: $num_seqs "
printf "" >! fileName2genomeName
while ($count < $num_seqs)
set name=`awk -F'\011' -v c="$count" 'FNR==c+1 {print $2}' fasta_list`
set file=`awk -F'\011' -v c="$count" 'FNR==c+1 {print $1}' fasta_list`
printf "$count\t$name\t$file\n"
$kSNP/merge_fasta_reads3 "$file" >! fsplit$count
printf "fsplit$count\t$name\n" >> fileName2genomeName
@ count ++
end
if ( $k <= 31 ) then
# jellyfish can do forward and reverse complement counts at same time, only the canonical direction (first in sorted list) kmer is listed, but counts are for both directions
date
echo "Running jellyfish to find k-mers"
foreach f (fsplit*[0-9])
if !(-s kmers_all.$f) then
echo "$f"
$kSNP/jellyfish count -C -o Jelly.$f -m $k -s 1000000000 -t $num_cpus $f
printf "" >! unsortedkmers.$f
foreach i (Jelly."$f"_*)
$kSNP/jellyfish dump -c $i >> unsortedkmers."$f"
end
sort unsortedkmers.$f >! kmers_all.$f
rm -f unsortedkmers.$f
endif
end
echo "Finished running jellyfish"
endif
if ($k>31 ) then
echo "Running sa to find k-mers"
date
foreach f (fsplit*[0-9])
if !(-s kmers_all.$f) then
$kSNP/sa $f $k 0
$kSNP/rc_kmer_freqs3 $f.counts >! kmers_all.$f
rm -f $f.counts
endif
end
echo "Finished running sa"
date
endif
# Remove kmers that occur less than freq=average of median and mean kmer frequency for that genome.
echo "Removing kmers that occur less than freq=average of median and mean kmer frequency for that genome."
date
foreach f (fsplit*[0-9])
awk '{print $2}' kmers_all.$f > ! freq.$f
set min_kmer_coverage=`$kSNP/get_quantile3 freq.$f`
echo "minimum kmer coverage for $f is $min_kmer_coverage"
awk -v m=$min_kmer_coverage '$2>=m {print}' kmers_all.$f >! kmers.$f
end
date
rm freq.*
# Remove kmers from a genome if there are conflicting alleles in that genome
echo "Removing conflicting kmers from each genome with conflicting alleles"
date
foreach f (fsplit*)
echo $f
mkdir Dir.$f
cd Dir.$f
$kSNP/subset_mers3 ../kmers.$f
printf "" >! cmds_remove_conflicting
foreach subset (*.mers)
echo "$kSNP/delete_allele_conflicts3 $subset" >> cmds_remove_conflicting
end
$kSNP/parallel_commands3 $num_cpus cmds_remove_conflicting
cd ..
end
echo "Finished removing conflicting kmers"
date
echo "Merged sorted kmer files and remove duplicates"
date
$kSNP/subset_mer_list3 > ! mer_list
printf "" >! cmds_sort
foreach subset (`cat mer_list`)
echo "sort -m -u Dir.*/$subset.conflictsDeleted > $subset" >> cmds_sort
end
$kSNP/parallel_commands3 $num_cpus cmds_sort
echo "Finished merging kmers across genomes"
date
################################ NEW
# Do not look for new SNPs, just find old ones from -SNPs_all input option
if ( $?SNPs_all ) then
# ADD GENOMES to existing SNP analysis
printf "Using existing SNPs from $SNPs_all file\n"
date
$kSNP/subset_SNPs_all3 "$SNPs_all"
foreach subset (`cat mer_list`)
if (-s $subset.SNPs_all) then
$kSNP/SNPs2fastaQuery3 $subset.SNPs_all >! SNP_loci.$subset.fasta
endif
end
endif
################################## if no -SNPs_all file or it is empty, then find new SNPs
if (! $?SNPs_all ) then
# do all the SNP finding
printf "Discovering new SNPs\n\n"
date
echo "Finding kmers with multiple allele variants"
printf "" >! cmds_pick_snps
foreach subset (`cat mer_list`)
echo "$kSNP/pick_snps_from_kmer_genome_counts3 $subset > SNP_loci.$subset.fasta" >> cmds_pick_snps
end
$kSNP/parallel_commands3 $num_cpus cmds_pick_snps
echo "Finished finding kmers with multiple allele variants"
endif
# Find which genome has which allele variant, by comparing the SNP_loci and Dir.$f/$subset.conflictsDeleted foreach genome
date
echo "Finding allele in each genome"
printf "" >! cmds_find_allele
foreach f (fsplit*)
foreach subset (`cat mer_list`)
echo "$kSNP/find_allele3 SNP_loci.$subset.fasta Dir.$f/$subset.conflictsDeleted $f > Dir.$f/SNPs.$subset" >> cmds_find_allele
end
end
$kSNP/parallel_commands3 $num_cpus cmds_find_allele
foreach f (fsplit*)
cat Dir.$f/SNPs.*.mers >! Dir.$f/SNPs
end
# Run mummer to find the position of each SNP in the finished genomes. Don't do this for unassembled draft genomes or merged raw read genomes, since positional information is not informative.
if (-s annotate_list) then
echo "Finding SNP positions in finished genomes using mummer."
date
printf "" >! cmds_mummer
printf "" >! cmds_parse_mummer
foreach genome (`cat annotate_list`)
set test=`grep -w $genome fileName2genomeName | wc -l`
set f=`grep -w $genome fileName2genomeName | awk '{print $1}'`
if ($test > 0 ) then
set file=`grep -w $genome fasta_list | awk -F'\011' '{print $1}'`
printf "genome: $genome in Dir.$f\n"
awk -F'\011' '{print ">" $1 "_" $2 "\n" $3 }' Dir.$f/SNPs >! Dir.$f/SNPs.fasta
printf "$kSNP/mummer -maxmatch -l $k -b -c Dir.$f/SNPs.fasta "'"'"$file"'"'" > Dir.$f/mummer.out\n" >> cmds_mummer
printf "$kSNP/parse_mummer4kSNP3 Dir.$f/mummer.out > Dir.$f/SNP.positions\n" >> cmds_parse_mummer
endif
end
$kSNP/parallel_commands3 $num_cpus cmds_mummer
$kSNP/parallel_commands3 $num_cpus cmds_parse_mummer
date
echo "Finished finding SNP positions in finished genomes using mummer."
endif
# concatenate SNP files for each genome into one and sort it, and number the loci
echo "Concatenate results for each genome and sort by locus to create SNPs_all_labelLoci"
date
printf "" >! all_SNPs_unsorted
foreach f (fsplit*)
set test=`grep -w $f fileName2genomeName | wc -l`
if ($test > 0 ) then
set genome=`grep -w $f fileName2genomeName | awk '{print $2}'`
printf "genome: $genome in Dir.$f\n"
if (-s Dir.$f/SNP.positions) then
awk -F'\011' -v f=$f '{print $1 "\t" $2 "\t" $3 "\t" f "\t" $4}' Dir.$f/SNP.positions >> all_SNPs_unsorted
#cat Dir.$f/SNP.positions >> all_SNPs_unsorted
else
awk -v genome=$genome '{print $1 "\t" $2 "\tx\t" genome "\t" }' Dir.$f/SNPs >> all_SNPs_unsorted
endif
endif
end
if ( $?SNPs_all ) then
# use existing SNP numbering
awk -F'\011' '{print $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6 "\t" $7}' "$SNPs_all" >> all_SNPs_unsorted
endif
sort -u all_SNPs_unsorted >! all_SNPs_sorted
$kSNP/number_SNPs_all3 all_SNPs_sorted
$kSNP/rename_from_table3 all_SNPs_sorted_labelLoci fileName2genomeName SNPs_all
# Set reference genome for vcf file to the be first finished genome, if this is empty, then set it to be the first genome in the input fasta file.
if (-s annotate_list) then
set ref_genome=`head -1 annotate_list`
endif
if !($?ref_genome) then
set ref_genome=`head -1 fileName2genomeName | awk '{print $2}'`
endif
if ($?vcf ) then
$kSNP/parse_SNPs2VCF3 SNPs_all VCF.$ref_genome.vcf $ref_genome
endif
echo "Finished finding SNPs"
date
# You can delete this Directory if everything works, but it's useful for debugging in case the run fails
rm -r TemporaryFilesToDelete
mkdir TemporaryFilesToDelete
mv -f Dir.* TemporaryFilesToDelete/.
if (-e cmds_mummer) then
mv -f cmds_mummer TemporaryFilesToDelete/.
mv -f cmds_parse_mummer TemporaryFilesToDelete/.
endif
mv -f *.mers TemporaryFilesToDelete/.
mv -f Jelly.* TemporaryFilesToDelete/.
mv -f SNP_loci.*.mers.fasta TemporaryFilesToDelete/.
mv -f kmers* TemporaryFilesToDelete/.
mv -f fsplit* TemporaryFilesToDelete/.
mv -f all_SNPs_unsorted TemporaryFilesToDelete/.
mv -f all_SNPs_sorted* TemporaryFilesToDelete/.
mv -f mer_list TemporaryFilesToDelete/.
mv -f *.mers.SNPs_all TemporaryFilesToDelete/.
##probes_from_SNPs_all_kmers $probe_prefix_label
## Create a SNP matrix and fasta, for inputting to PHYLIP, FastTreeMP or other tools like SplitsTree
$kSNP/SNPs_all_2_fasta_matrix3 SNPs_all SNPs_all_matrix.fasta SNPs_all_matrix
printf "parsimony\n" >! tree_list1
printf "parsimony\n" >! tree_list2
############### Make tree using SNP matrix
echo "Building parsimony tree"
# Build parsimony tree
$kSNP/parsimonator -s SNPs_all_matrix -n SNPs_all -N 100 -p 1234
# get all the best scoring trees
set best_parsimony_tree_score=`grep "Parsimony tree" RAxML_info.SNPs_all | sort -k6 -n | head -1 | awk '{print $6}'`
set best_parsimony_trees=`grep "Parsimony tree" RAxML_info.SNPs_all | awk -v score=$best_parsimony_tree_score '$6==score {print $14}'`
set Num_best_parsimony_trees=`grep "Parsimony tree" RAxML_info.SNPs_all | awk -v score=$best_parsimony_tree_score '$6==score {print $14}' | wc -l | awk '{print $1}'`
printf "Number of most parsimonious trees from SNPs_all: $Num_best_parsimony_trees\n"
printf "Score of those trees: $best_parsimony_tree_score\n"
cat $best_parsimony_trees >! intree
# Get majority consensus tree
rm outfile outtree
#PHYLIP consense was the only tool i found that forced resolution of every branch. FastTree to give it branch lengths will crash if some notes have splits to >2 children. But you need to modify seq.h and phylip.h before compiling consense to allow longer names so they don't get truncated
echo "Y\n" | $kSNP/consense
# Give it branch lengths, optimized for the consensus parsimony tree.
$kSNP/force_binary_tree outtree outtree.resolved
$kSNP/FastTreeMP -nt -pseudo -nome -mllen -gamma -gtr -intree outtree.resolved SNPs_all_matrix.fasta >! tree.parsimony.tre
mv RAxML* TemporaryFilesToDelete/.
## Build parsimony tree from SNPs_in_majority"$min_fraction_with_locus"
if ($?min_fraction_with_locus) then
printf "Getting SNPs_in_majority$min_fraction_with_locus and building tree\n"
$kSNP/core_SNPs3 SNPs_all fileName2genomeName $min_fraction_with_locus
$kSNP/SNPs_all_2_fasta_matrix3 SNPs_in_majority"$min_fraction_with_locus" SNPs_in_majority"$min_fraction_with_locus"_matrix.fasta SNPs_in_majority"$min_fraction_with_locus"_matrix
# Build parsimony tree
$kSNP/parsimonator -s SNPs_in_majority"$min_fraction_with_locus"_matrix -n SNPs_majority"$min_fraction_with_locus" -N 100 -p 1234
# get all the best scoring trees
set best_parsimony_tree_score=`grep "Parsimony tree" RAxML_info.SNPs_majority"$min_fraction_with_locus" | sort -k6 -n | head -1 | awk '{print $6}'`
set best_parsimony_trees=`grep "Parsimony tree" RAxML_info.SNPs_majority"$min_fraction_with_locus" | awk -v score=$best_parsimony_tree_score '$6==score {print $14}'`
set Num_best_parsimony_trees=`grep "Parsimony tree" RAxML_info.SNPs_majority"$min_fraction_with_locus" | awk -v score=$best_parsimony_tree_score '$6==score {print $14}' | wc -l | awk '{print $1}'`
printf "Number of most parsimonious trees for SNPs_in_majority$min_fraction_with_locus : $Num_best_parsimony_trees\n"
printf "Score of those trees: $best_parsimony_tree_score\n"
cat $best_parsimony_trees >! intree
# Get majority consensus tree
rm outfile outtree
#Find consensus parsimony tree
echo "Y\n" | $kSNP/consense
# Give it branch lengths, optimized for the consensus parsimony tree.
$kSNP/force_binary_tree outtree outtree.resolved
$kSNP/FastTreeMP -nt -pseudo -nome -mllen -gamma -gtr -intree outtree.resolved SNPs_in_majority"$min_fraction_with_locus"_matrix.fasta >! tree.majority"$min_fraction_with_locus".tre
mv RAxML* TemporaryFilesToDelete/.
# Uncomment the following line to build ML majority tree, and write over the parsimony majority tree just built
#$kSNP/FastTreeMP -nt -pseudo -gamma -gtr SNPs_in_majority"$min_fraction_with_locus"_matrix.fasta >! tree.majority"$min_fraction_with_locus".tre
foreach t ( majority"$min_fraction_with_locus" )
$kSNP/label_tree_nodes3 tree.$t.tre > ! tree_nodeLabel.$t.tre
$kSNP/tree_nodes3 tree_nodeLabel."$t".tre nodes.$t
if (-s tree_nodeLabel.$t.tre ) then
echo "Placing SNPs on nodes $t tree"
$kSNP/SNPs2nodes-new3 SNPs_in_majority"$min_fraction_with_locus" nodes.$t.perlhash tree_nodeLabel.$t.tre Node_SNP_counts.$t
if (-e COUNT_Homoplastic_SNPs) then
mv COUNT_Homoplastic_SNPs COUNT_Homoplastic_SNPs.$t
endif
if (-e ClusterInfo) then
mv ClusterInfo ClusterInfo.$t
endif
if (-e Homoplasy_groups) then
mv Homoplasy_groups Homoplasy_groups.$t
endif
date
echo "Finished placing SNPs on nodes $t tree"
printf "name_on_tree\tSNP_counts\n" >! tip_SNP_counts.$t
grep "node: " Node_SNP_counts.$t | grep -w "NumberTargets: 1" | awk '{print $2 "\011" $6}' >> tip_SNP_counts.$t
if (-s tree_nodeLabel.$t.tre.rerooted) then
rm -f tree_nodeLabel.$t.tre
mv -f tree_nodeLabel.$t.tre.rerooted tree_nodeLabel.$t.tre
endif
#rm_node_names_from_tree tree_nodeLabel.$t.tre tree.$t.tre # don't overwrite tree.$t.tre anymore since we want the support values in original file.
$kSNP/labelTree_AlleleCount-new3 tree_nodeLabel.$t.tre Node_SNP_counts.$t tree_tipAlleleCounts.$t.tre tree_AlleleCounts.$t.tre 0
$kSNP/labelTree_AlleleCount-new3 tree_nodeLabel.$t.tre Node_SNP_counts.$t tree_tipAlleleCounts.$t.NodeLabel.tre tree_AlleleCounts.$t.NodeLabel.tre 1
endif
end
endif
##Building parsimony tree from only the core SNPs
if ($?core) then
printf "Getting core SNPs"
if (! $?min_fraction_with_locus) then
$kSNP/core_SNPs3 SNPs_all fileName2genomeName 0.5
endif
$kSNP/SNPs_all_2_fasta_matrix3 core_SNPs core_SNPs_matrix.fasta core_SNPs_matrix
# Build parsimony tree
$kSNP/parsimonator -s core_SNPs_matrix -n SNPs_core -N 100 -p 1234
# get all the best scoring trees
set best_parsimony_tree_score=`grep "Parsimony tree" RAxML_info.SNPs_core | sort -k6 -n | head -1 | awk '{print $6}'`
set best_parsimony_trees=`grep "Parsimony tree" RAxML_info.SNPs_core | awk -v score=$best_parsimony_tree_score '$6==score {print $14}'`
set Num_best_parsimony_trees=`grep "Parsimony tree" RAxML_info.SNPs_core | awk -v score=$best_parsimony_tree_score '$6==score {print $14}' | wc -l | awk '{print $1}'`
printf "Number of most parsimonious trees for SNPs_core : $Num_best_parsimony_trees\n"
printf "Score of those trees: $best_parsimony_tree_score\n"
cat $best_parsimony_trees >! intree
# Get majority consensus tree
rm outfile outtree
#Find consensus parsimony tree
echo "Y\n" | $kSNP/consense
# Give it branch lengths, optimized for the consensus parsimony tree.
$kSNP/force_binary_tree outtree outtree.resolved
$kSNP/FastTreeMP -nt -pseudo -nome -mllen -gamma -gtr -intree outtree.resolved core_SNPs_matrix.fasta >! tree.core.tre
mv RAxML* TemporaryFilesToDelete/.
# Uncomment the following line to build ML core tree, and write over the parsimony core tree just built
# $kSNP/FastTreeMP -nt -gamma -gtr core_SNPs_matrix.fasta >! tree.core.tre
if (-s core_SNPs) then
foreach t ( core )
$kSNP/label_tree_nodes3 tree.$t.tre > ! tree_nodeLabel.$t.tre
$kSNP/tree_nodes3 tree_nodeLabel."$t".tre nodes.$t
if (-s tree_nodeLabel.$t.tre ) then
echo "Placing SNPs on nodes $t tree"
$kSNP/SNPs2nodes-new3 core_SNPs nodes.$t.perlhash tree_nodeLabel.$t.tre Node_SNP_counts.$t
if (-e COUNT_Homoplastic_SNPs) then
mv COUNT_Homoplastic_SNPs COUNT_Homoplastic_SNPs.$t
endif
if (-e ClusterInfo) then
mv ClusterInfo ClusterInfo.$t
endif
if (-e Homoplasy_groups) then
mv Homoplasy_groups Homoplasy_groups.$t
endif
date
echo "Finished placing SNPs on nodes $t tree"
echo ""
printf "name_on_tree\tSNP_counts\n" >! tip_SNP_counts.$t
grep "node: " Node_SNP_counts.$t | grep -w "NumberTargets: 1" | awk '{print $2 "\011" $6}' >> tip_SNP_counts.$t
if (-s tree_nodeLabel.$t.tre.rerooted) then
rm -f tree_nodeLabel.$t.tre
mv -f tree_nodeLabel.$t.tre.rerooted tree_nodeLabel.$t.tre
endif
#rm_node_names_from_tree tree_nodeLabel.$t.tre tree.$t.tre # don't overwrite tree.$t.tre anymore since we want the support values in original file.
$kSNP/labelTree_AlleleCount-new3 tree_nodeLabel.$t.tre Node_SNP_counts.$t tree_tipAlleleCounts.$t.tre tree_AlleleCounts.$t.tre 0
$kSNP/labelTree_AlleleCount-new3 tree_nodeLabel.$t.tre Node_SNP_counts.$t tree_tipAlleleCounts.$t.NodeLabel.tre tree_AlleleCounts.$t.NodeLabel.tre 1
endif
end
endif
endif
## Building ML FastTree tree from all SNPs
if ($?ML) then
$kSNP/FastTreeMP -nt -pseudo -gamma -gtr SNPs_all_matrix.fasta >! tree.ML.tre
printf "ML\n" >> tree_list1
printf "ML\n" >> tree_list2
endif
if ( $?nj) then
echo "Building NJ tree"
date
# NOTE: This next line can take a long time if there are million+ SNP loci and 100+ genomes. SNP_matrix2dist_matrix does loops, so it's slow, should be parallelized. Probably should try the PHYLIP program, although scores might be different since i count them as somewhat closer if they share a locus but not the allele than if they don't even share the locus. But since NJ SNP trees are not accurate anyway, i'm not inclined to spend anymore time since no one should use this option.
$kSNP/SNP_matrix2dist_matrix3 SNPs_all_matrix >! NJ.dist.matrix
$kSNP/distance_tree3 >! tree.NJ.tre
echo "Finished building NJ tree"
printf "NJ\n" >> tree_list1
printf "NJ\n" >> tree_list2
date
endif
#######################
$kSNP/find_unresolved_clusters3 tree.parsimony.tre >! unresolved_clusters
date
echo "Finding nodes"
foreach t ( `cat tree_list1` )
if (-s tree."$t".tre) then
$kSNP/label_tree_nodes3 tree.$t.tre > ! tree_nodeLabel.$t.tre
$kSNP/tree_nodes3 tree_nodeLabel."$t".tre nodes.$t
echo "Placing SNPs on nodes $t tree"
$kSNP/SNPs2nodes-new3 SNPs_all nodes.$t.perlhash tree_nodeLabel.$t.tre Node_SNP_counts.$t
if (-e COUNT_Homoplastic_SNPs) then
mv COUNT_Homoplastic_SNPs COUNT_Homoplastic_SNPs.$t
endif
if (-e ClusterInfo) then
mv ClusterInfo ClusterInfo.$t
endif
if (-e Homoplasy_groups) then
mv Homoplasy_groups Homoplasy_groups.$t
endif
date
echo "Finished placing SNPs on nodes $t tree"
echo ""
endif
end
# Relabel trees with SNP counts at nodes
foreach t ( `cat tree_list1` )
if (-s tree."$t".tre) then
printf "name_on_tree\tSNP_counts\n" >! tip_SNP_counts.$t
grep "node: " Node_SNP_counts.$t | grep -w "NumberTargets: 1" | awk '{print $2 "\011" $6}' >> tip_SNP_counts.$t
if (-s tree_nodeLabel.$t.tre.rerooted) then
rm -f tree_nodeLabel.$t.tre
mv -f tree_nodeLabel.$t.tre.rerooted tree_nodeLabel.$t.tre
endif
#rm_node_names_from_tree tree_nodeLabel.$t.tre tree.$t.tre # don't overwrite tree.$t.tre anymore since we want the support values in original file.
$kSNP/labelTree_AlleleCount-new3 tree_nodeLabel.$t.tre Node_SNP_counts.$t tree_tipAlleleCounts.$t.tre tree_AlleleCounts.$t.tre 0
$kSNP/labelTree_AlleleCount-new3 tree_nodeLabel.$t.tre Node_SNP_counts.$t tree_tipAlleleCounts.$t.NodeLabel.tre tree_AlleleCounts.$t.NodeLabel.tre 1
endif
end
mv -f nodes.* TemporaryFilesToDelete/.
mv -f tree_tipAlleleCounts.*.NodeLabel.tre TemporaryFilesToDelete/.
mv -f tree_nodeLabel.* TemporaryFilesToDelete/.
########
# find proteins where SNPs land, codons, amino acids, and identify nonsynonymous SNPs
echo "Annotating SNPs."
date
# Only get genbank file and annoate if there is positional information for some genomes, ie. annotate_list is not empty
if (-s annotate_list) then
# Get whole genome annotations from genbank, unfortunately you have to get the whole genbank file with sequence data, since the much smaller feature table does not have mature peptides making viral annotation useless with polyproteins only.
set count=0
printf "" >! headers.annotate_list
foreach genome (`cat annotate_list`)
set file_check=`grep -w $genome fasta_list | wc -l`
if ($file_check > 0 ) then
set file=`grep -w $genome fasta_list | awk -F'\011' '{print $1}'`
printf "$file\n"
$kSNP/get_genbank_file3 "$file" genbank_from_NCBI.gbk.$count
fgrep ">" "$file" | sed -e "s/^>/>$genome /" >> headers.annotate_list
@ count ++
endif
end
cat genbank_from_NCBI.gbk.* | grep -v BioProject >! genbank_from_NCBI.gbk
rm genbank_from_NCBI.gbk.*
if (-e "$genbankFile" ) then
$kSNP/annotate_SNPs_from_genbankFiles3 -all $all_annotations $genbankFile
else
$kSNP/annotate_SNPs_from_genbankFiles3 -all $all_annotations
endif
printf "Num_NotAnnotatedRegion\tAnnotatedNotProtein\tNum_NonSynon\tNum_Synon\tNS/S\tNSfractionOfAnnotated\tNumLoci\tNum_InAnnotatedGenomes\tNum_NotInAnnotatedGenome\n" >! Annotation_summary
set i=SNP_annotations
set num_notInAnnotatedGenome=`grep NotInAnnotatedGenome $i | awk ' {print $1}' | sort -u | wc -l | awk '{print $1}'`
set num_UnAnnRegion=`grep UnannotatedRegion $i | awk ' {print $1}' | sort -u | wc -l | awk '{print $1}'`
set num_AnnNotProtein=`grep NotProteinCoding $i | awk ' {print $1}' | sort -u | wc -l | awk '{print $1}'`
set NS_total=`grep -v LocusNum $i | awk ' $3>0 {print $1}' | sort -u | wc -l | awk '{print $1}'`
set Num_loci=`grep -v LocusNum $i | awk '{print $1}' | sort -u | wc -l | awk '{print $1}'`
set Num_loci_in_annotated=`grep -v LocusNum $i | grep -v NotInAnnotatedGenome | awk '{print $1}' | sort -u | wc -l | awk '{print $1}'`
set S_total=`perl -e "print ($Num_loci_in_annotated-$NS_total)"`
if ($S_total > 0) then
set NS_Sratio=`perl -e "print $NS_total/$S_total"`
else
set NS_Sratio="inf"
endif
if ($Num_loci_in_annotated > 0) then
set NSfraction_overall=`perl -e "print $NS_total/$Num_loci_in_annotated"`
else
set NSfraction_overall="inf"
endif
printf "$num_UnAnnRegion\t$num_AnnNotProtein\t$NS_total\t$S_total\t$NS_Sratio\t$NSfraction_overall\t$Num_loci\t$Num_loci_in_annotated\t$num_notInAnnotatedGenome\n" >> Annotation_summary
$kSNP/parse_protein_annotation_counts3 SNP_annotations >! Protein_Annotation_counts
echo "Finished SNP annotation."
endif
echo "Finished running kSNP"
date
set endseconds=`date +%s`
set elapsedTime=`perl -e "print (($endseconds-$startseconds)/60/60)"`
echo "Elapsed time for kSNP in hours: $elapsedTime"
mv cmds* TemporaryFilesToDelete/.
mv tree_list1 TemporaryFilesToDelete/.
mv tree_list2 TemporaryFilesToDelete/.
mv -f fileName2genomeName TemporaryFilesToDelete/.
rm intree outtree outfile
if (-s SNPs_all && -s tree.parsimony.tre && -s tree_AlleleCounts.parsimony.tre && -s unresolved_clusters && -s COUNT_SNPs && $DEBUG<1) then
rm -r TemporaryFilesToDelete
endif
exit
##########################################################################################################
##########################################################################################################
# HRE finder is not updated to work with kSNP3. Use kSNP2 if you want to use HREfinder.
# In case you want to run HREFinder at http://sourceforge.net/projects/hrefinder/
# set your path to the hreFinder code "set hre=/path/to/hreFinder"
# and comment out the exit line above.
# YOU MUST HAVE ALL THE GENOMES IN THE -p annotate_list LIST. FOR HREFINDER YOU NEED POSITIONAL
# INFORMATION FOR ALL OF THEM, EVEN THE DRAFT GENOMES THAT ARE ASSEMBLED INTO A FEW LARGE CONTIGS.
# If some draft genomes are in alot of contigs, it is recommended that
# you remove those and rerun kSNP before attempting hreFinder.
# Don't run hreFinder with genomes that are raw unassembled reads.
###### Run hreFinder to predict series of SNPs likely to have been involved in homologous recombination events
set hre=/usr/gapps/kpath/hreFinder
if (-s SNPs_all) then
# Set reference genome for vcf file to the be first finished genome, if this is empty, then set it to be the first genome in the input fasta file.
if (-s annotate_list) then
set ref_genome=`head -1 annotate_list`
endif
if !($?ref_genome) then
set ref_genome=`head -1 fileName2genomeName | awk '{print $2}'`
endif
foreach tree (`cat tree_list2`)
mkdir HRE.$tree
cd HRE.$tree
$hre/run_config.py ../tree.$tree.tre ../fastainput ../SNPs_all $ref_genome
echo ""
echo $tree
echo "Number of SNPs involved in HRE events:"
awk '$1!="" {print $1}' hreSNPs | sort -u | wc -l
echo "Number of HRE events:"
grep -v HRE_events hre_from_to_c | awk ' total=total+$5 {} END {print total}'
echo "Number of HRE events from outside tree:"
grep -v HRE_events hre_from_to_c | grep outside | awk ' total=total+$5 {} END {print total}'
cd ..
end
endif
exit
# Set up standing db of genbank files so you don't have to go online to annotate SNPs
mkdir GenbankFiles
cd GenbankFiles
foreach domain ( Viruses Bacteria )
mkdir $domain
cd $domain
foreach type (gbk)
mkdir Temp
cd Temp
wget "ftp://ftp.ncbi.nih.gov/genomes/$domain/all.$type.tar.gz"
tar -xvzf all.$type.tar.gz
rm *.tar.gz
mv */* ..
rm -r *
cd ..
rm -r Temp
end
cd ..
end
# get gbk files for plasmids
foreach domain ( Plasmids)
mkdir $domain
cd $domain
foreach type (gbk)
wget "ftp://ftp.ncbi.nih.gov/genomes/$domain/plasmids.all.$type.tar.gz"
tar -xvzf plasmids.all.$type.tar.gz
mv am/ftp-genomes/Plasmids/$type/* .
rm -r am
rm *.tar.gz
end
cd ..
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment