Created
July 27, 2015 21:05
-
-
Save muppetjones/f0dce9a6cad6cc4b9fa1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/tcsh | |
#v3.91 | |
######################################################################## | |
# WHERE ARE ALL THE kSNP SCRIPTS? | |
# IF YOU INSTALLED kSNP ANYWHERE OTHER THAN /user/local THEN YOU MUST MODIFY THIS TO POINT TO THE DIRECTORY WHERE YOU HAVE INSTALLED kSNP SCRIPTS | |
set kSNP=/usr/local/kSNP3 | |
#set kSNP=/g/g15/shea/kSNP3_Source | |
######################################################################## | |
# Example: kSNP3 -in example.fasta.list -outdir Test.out -k 13 -annotate example.annotate_list -min_frac 0.7 | |
if ($#argv == 0) then | |
printf "Usage: kSNP3 <options>\ | |
Options are the following: \ | |
-k <kmer_length> # required\n\ | |
-in <input_fastaFile_list> # required file listing full path location of each genome and the genome name, one line per genome, tab delimited between full path to genome fasta file in column 1 and genome name in column 2. This format allows multi-read,multi-chromosome, and multi-contig genomes, each genome in separate fasta. If multiple chromosomes are listed as separate fasta entries in a single genome file, positions and annotations are found for each gi number \n\ | |
-outdir <output_directory> # required \n\ | |
-annotate <annotate_list> # optional file listing genome names for which to find positions and annotate SNPs, names match column 2 of the -in file. \n\ | |
-SNPs_all <path to SNPs all file> # optional, if given then it uses existing SNPs instead of searching for new ones, and adds new genomes to the existing analysis. Assumes only the new genomes are listed in the -in file.\n\ | |
-core # optional, if present calculate core SNPs and core SNP parsimony tree\n\ | |
-ML # optional, if present calculate Maximum Likelihood tree\n\ | |
-min_frac <minimum_fraction_genomes_with_locus> # optional to create a parsimony tree based only on SNP loci that occur in at least this fraction of genomes, for example -min_frac 0.5 \n\ | |
-genbank <genbank.gbk> # optional file for SNP annotation\n\ | |
-CPU <num_CPU> # optional, number of CPU's to use, if not specified it uses all available\n\ | |
-NJ # optional, calculate a neighbor joining tree\n\ | |
-vcf # optional, create a vcf file using the first genome specified in the -positions file as the reference genome\n\ | |
-all_annotations # optional, annotate each locus exhaustively with all the annotations in any of the annotated genomes. Without this option it only provides the first annotation it comes to for a given locus, checking in the order genomes are listed in the -annotate file.\n" | |
exit | |
endif | |
set DEBUG=0 | |
echo "Location of kSNP scripts: " | |
echo "$kSNP" | |
# Tell kSNP where the input files are | |
set thisDir = `pwd` # Directory with the input files | |
echo "The home directory is $thisDir" | |
# Read in parameters from command line | |
set annotate_list="nonexistent_file" | |
set genbankFile="nonexistent_file" | |
set all_annotations=0 | |
while ($#argv > 0) | |
switch ($argv[1]) | |
case -vcf: | |
set vcf=1 | |
breaksw | |
case -NJ: | |
set nj=1 | |
breaksw | |
case -ML: | |
set ML=1 | |
breaksw | |
case -core: | |
set core=1 | |
breaksw | |
case -k: | |
shift | |
set k=$argv[1] | |
breaksw | |
case -in: | |
shift | |
set fasta_list="$argv[1]" | |
breaksw | |
case -outdir: | |
shift | |
set dir="$argv[1]" | |
breaksw | |
case -annotate: | |
shift | |
set annotate_list="$argv[1]" | |
breaksw | |
case -all_annotations: | |
set all_annotations=1 | |
breaksw | |
case -min_frac: | |
shift | |
set min_fraction_with_locus="$argv[1]" | |
breaksw | |
case -genbank: | |
shift | |
set genbankFile="$argv[1]" | |
breaksw | |
case -CPU: | |
shift | |
set num_cpus=$argv[1] | |
breaksw | |
case -SNPs_all: | |
shift | |
set SNPs_all="$argv[1]" | |
breaksw | |
default: | |
shift | |
printf "Unknown parameter $argv[1]\n" | |
endsw | |
shift | |
end | |
if ($?dir) then | |
set dir = `$kSNP/add_paths3 "$dir" "$thisDir"` | |
printf "dir $dir\n" | |
endif | |
if ($?fasta_list) then | |
set fasta_list = `$kSNP/add_paths3 "$fasta_list" "$thisDir"` | |
printf "fasta_list $fasta_list\n" | |
endif | |
if ($?annotate_list) then | |
set annotate_list = `$kSNP/add_paths3 "$annotate_list" "$thisDir"` | |
printf "annotate_list $annotate_list\n" | |
endif | |
if ($?genbankFile) then | |
set genbankFile = `$kSNP/add_paths3 "$genbankFile" "$thisDir"` | |
printf "genbankFile $genbankFile\n" | |
endif | |
if ($?SNPs_all) then | |
set SNPs_all = `$kSNP/add_paths3 "$SNPs_all" "$thisDir"` | |
printf "SNPs_all $SNPs_all\n" | |
endif | |
echo "Starting kSNP" | |
date | |
set startseconds=`date +%s` | |
echo "input fasta_list: $fasta_list" | |
echo "output directory: $dir" | |
echo "k=$k" | |
echo "annotate_list file: $annotate_list" | |
if ($all_annotations == 1) then | |
echo "Report all annotations." | |
else | |
echo "Report minimal annotations." | |
endif | |
if ( $?min_fraction_with_locus ) then | |
echo "min_fraction_with_locus: $min_fraction_with_locus" | |
endif | |
if ($?genbankFile ) then | |
if ( -e "$genbankFile") then | |
echo "Genbank file for annotations (and any from NCBI with gi number which are automatically downloaded): $genbankFile" | |
endif | |
endif | |
#get the number of CPUs | |
if !($?num_cpus) then | |
#get the operating system | |
set OS=`uname` | |
if ($OS == 'Darwin') then | |
echo "The operating system is $OS" | |
/usr/sbin/system_profiler SPHardwareDataType>wubba | |
set num_cpus=` awk '/Total Number of Cores/ {print $5}' wubba` | |
echo "There are $num_cpus CPUs" | |
rm wubba | |
##@ num_cpus=$num_proc | |
endif | |
if ($OS != 'Darwin') then | |
echo "The operating system is $OS" | |
set num_cpus=`cat /proc/cpuinfo | grep processor | wc -l` | |
endif | |
if ($num_cpus < 1 ) then | |
set num_cpus=8 | |
echo "Could not figure out the number of CPUs, will run 8 processes" | |
endif | |
endif | |
echo "Number CPUs: $num_cpus" | |
#chesk the fasta genome files to be sure line endings are Unix and fix if they are not | |
cp -f "$fasta_list" fasta_list | |
$kSNP/LE2Unix fasta_list | |
# First check genome names. Prints to STDERR if duplicate names, STDOUT list of genome names parsed for kSNP. Use names corresponding to (none, any or all of) these in the $annotate_list file. | |
#echo "Sequence names used for kSNP:" | |
#$kSNP/genome_names3 "$fasta" | |
if !( -e "$dir") then | |
mkdir "$dir" | |
endif | |
cd "$dir" | |
if ( -e "$annotate_list") then | |
cp -f "$annotate_list" annotate_list | |
else | |
touch annotate_list | |
endif | |
cp -f "$fasta_list" fasta_list | |
#DOS to unix | |
perl -i -pe 's/\015\012/\012/g' annotate_list | |
perl -i -pe 's/\015/\012/g' annotate_list | |
perl -i -pe 's/\015\012/\012/g' fasta_list | |
perl -i -pe 's/\015/\012/g' fasta_list | |
echo "Finished genomes for finding SNP positions:" | |
cat annotate_list | |
echo "" | |
# Make lookup table of genome names and fsplit# files, and create fsplit# files by merging entries of multi-contig/multi-read input genomes. | |
set count=0 | |
set num_seqs=`wc -l fasta_list | awk '{print $1}' ` | |
echo "Number of input sequences: $num_seqs " | |
printf "" >! fileName2genomeName | |
while ($count < $num_seqs) | |
set name=`awk -F'\011' -v c="$count" 'FNR==c+1 {print $2}' fasta_list` | |
set file=`awk -F'\011' -v c="$count" 'FNR==c+1 {print $1}' fasta_list` | |
printf "$count\t$name\t$file\n" | |
$kSNP/merge_fasta_reads3 "$file" >! fsplit$count | |
printf "fsplit$count\t$name\n" >> fileName2genomeName | |
@ count ++ | |
end | |
if ( $k <= 31 ) then | |
# jellyfish can do forward and reverse complement counts at same time, only the canonical direction (first in sorted list) kmer is listed, but counts are for both directions | |
date | |
echo "Running jellyfish to find k-mers" | |
foreach f (fsplit*[0-9]) | |
if !(-s kmers_all.$f) then | |
echo "$f" | |
$kSNP/jellyfish count -C -o Jelly.$f -m $k -s 1000000000 -t $num_cpus $f | |
printf "" >! unsortedkmers.$f | |
foreach i (Jelly."$f"_*) | |
$kSNP/jellyfish dump -c $i >> unsortedkmers."$f" | |
end | |
sort unsortedkmers.$f >! kmers_all.$f | |
rm -f unsortedkmers.$f | |
endif | |
end | |
echo "Finished running jellyfish" | |
endif | |
if ($k>31 ) then | |
echo "Running sa to find k-mers" | |
date | |
foreach f (fsplit*[0-9]) | |
if !(-s kmers_all.$f) then | |
$kSNP/sa $f $k 0 | |
$kSNP/rc_kmer_freqs3 $f.counts >! kmers_all.$f | |
rm -f $f.counts | |
endif | |
end | |
echo "Finished running sa" | |
date | |
endif | |
# Remove kmers that occur less than freq=average of median and mean kmer frequency for that genome. | |
echo "Removing kmers that occur less than freq=average of median and mean kmer frequency for that genome." | |
date | |
foreach f (fsplit*[0-9]) | |
awk '{print $2}' kmers_all.$f > ! freq.$f | |
set min_kmer_coverage=`$kSNP/get_quantile3 freq.$f` | |
echo "minimum kmer coverage for $f is $min_kmer_coverage" | |
awk -v m=$min_kmer_coverage '$2>=m {print}' kmers_all.$f >! kmers.$f | |
end | |
date | |
rm freq.* | |
# Remove kmers from a genome if there are conflicting alleles in that genome | |
echo "Removing conflicting kmers from each genome with conflicting alleles" | |
date | |
foreach f (fsplit*) | |
echo $f | |
mkdir Dir.$f | |
cd Dir.$f | |
$kSNP/subset_mers3 ../kmers.$f | |
printf "" >! cmds_remove_conflicting | |
foreach subset (*.mers) | |
echo "$kSNP/delete_allele_conflicts3 $subset" >> cmds_remove_conflicting | |
end | |
$kSNP/parallel_commands3 $num_cpus cmds_remove_conflicting | |
cd .. | |
end | |
echo "Finished removing conflicting kmers" | |
date | |
echo "Merged sorted kmer files and remove duplicates" | |
date | |
$kSNP/subset_mer_list3 > ! mer_list | |
printf "" >! cmds_sort | |
foreach subset (`cat mer_list`) | |
echo "sort -m -u Dir.*/$subset.conflictsDeleted > $subset" >> cmds_sort | |
end | |
$kSNP/parallel_commands3 $num_cpus cmds_sort | |
echo "Finished merging kmers across genomes" | |
date | |
################################ NEW | |
# Do not look for new SNPs, just find old ones from -SNPs_all input option | |
if ( $?SNPs_all ) then | |
# ADD GENOMES to existing SNP analysis | |
printf "Using existing SNPs from $SNPs_all file\n" | |
date | |
$kSNP/subset_SNPs_all3 "$SNPs_all" | |
foreach subset (`cat mer_list`) | |
if (-s $subset.SNPs_all) then | |
$kSNP/SNPs2fastaQuery3 $subset.SNPs_all >! SNP_loci.$subset.fasta | |
endif | |
end | |
endif | |
################################## if no -SNPs_all file or it is empty, then find new SNPs | |
if (! $?SNPs_all ) then | |
# do all the SNP finding | |
printf "Discovering new SNPs\n\n" | |
date | |
echo "Finding kmers with multiple allele variants" | |
printf "" >! cmds_pick_snps | |
foreach subset (`cat mer_list`) | |
echo "$kSNP/pick_snps_from_kmer_genome_counts3 $subset > SNP_loci.$subset.fasta" >> cmds_pick_snps | |
end | |
$kSNP/parallel_commands3 $num_cpus cmds_pick_snps | |
echo "Finished finding kmers with multiple allele variants" | |
endif | |
# Find which genome has which allele variant, by comparing the SNP_loci and Dir.$f/$subset.conflictsDeleted foreach genome | |
date | |
echo "Finding allele in each genome" | |
printf "" >! cmds_find_allele | |
foreach f (fsplit*) | |
foreach subset (`cat mer_list`) | |
echo "$kSNP/find_allele3 SNP_loci.$subset.fasta Dir.$f/$subset.conflictsDeleted $f > Dir.$f/SNPs.$subset" >> cmds_find_allele | |
end | |
end | |
$kSNP/parallel_commands3 $num_cpus cmds_find_allele | |
foreach f (fsplit*) | |
cat Dir.$f/SNPs.*.mers >! Dir.$f/SNPs | |
end | |
# Run mummer to find the position of each SNP in the finished genomes. Don't do this for unassembled draft genomes or merged raw read genomes, since positional information is not informative. | |
if (-s annotate_list) then | |
echo "Finding SNP positions in finished genomes using mummer." | |
date | |
printf "" >! cmds_mummer | |
printf "" >! cmds_parse_mummer | |
foreach genome (`cat annotate_list`) | |
set test=`grep -w $genome fileName2genomeName | wc -l` | |
set f=`grep -w $genome fileName2genomeName | awk '{print $1}'` | |
if ($test > 0 ) then | |
set file=`grep -w $genome fasta_list | awk -F'\011' '{print $1}'` | |
printf "genome: $genome in Dir.$f\n" | |
awk -F'\011' '{print ">" $1 "_" $2 "\n" $3 }' Dir.$f/SNPs >! Dir.$f/SNPs.fasta | |
printf "$kSNP/mummer -maxmatch -l $k -b -c Dir.$f/SNPs.fasta "'"'"$file"'"'" > Dir.$f/mummer.out\n" >> cmds_mummer | |
printf "$kSNP/parse_mummer4kSNP3 Dir.$f/mummer.out > Dir.$f/SNP.positions\n" >> cmds_parse_mummer | |
endif | |
end | |
$kSNP/parallel_commands3 $num_cpus cmds_mummer | |
$kSNP/parallel_commands3 $num_cpus cmds_parse_mummer | |
date | |
echo "Finished finding SNP positions in finished genomes using mummer." | |
endif | |
# concatenate SNP files for each genome into one and sort it, and number the loci | |
echo "Concatenate results for each genome and sort by locus to create SNPs_all_labelLoci" | |
date | |
printf "" >! all_SNPs_unsorted | |
foreach f (fsplit*) | |
set test=`grep -w $f fileName2genomeName | wc -l` | |
if ($test > 0 ) then | |
set genome=`grep -w $f fileName2genomeName | awk '{print $2}'` | |
printf "genome: $genome in Dir.$f\n" | |
if (-s Dir.$f/SNP.positions) then | |
awk -F'\011' -v f=$f '{print $1 "\t" $2 "\t" $3 "\t" f "\t" $4}' Dir.$f/SNP.positions >> all_SNPs_unsorted | |
#cat Dir.$f/SNP.positions >> all_SNPs_unsorted | |
else | |
awk -v genome=$genome '{print $1 "\t" $2 "\tx\t" genome "\t" }' Dir.$f/SNPs >> all_SNPs_unsorted | |
endif | |
endif | |
end | |
if ( $?SNPs_all ) then | |
# use existing SNP numbering | |
awk -F'\011' '{print $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6 "\t" $7}' "$SNPs_all" >> all_SNPs_unsorted | |
endif | |
sort -u all_SNPs_unsorted >! all_SNPs_sorted | |
$kSNP/number_SNPs_all3 all_SNPs_sorted | |
$kSNP/rename_from_table3 all_SNPs_sorted_labelLoci fileName2genomeName SNPs_all | |
# Set reference genome for vcf file to the be first finished genome, if this is empty, then set it to be the first genome in the input fasta file. | |
if (-s annotate_list) then | |
set ref_genome=`head -1 annotate_list` | |
endif | |
if !($?ref_genome) then | |
set ref_genome=`head -1 fileName2genomeName | awk '{print $2}'` | |
endif | |
if ($?vcf ) then | |
$kSNP/parse_SNPs2VCF3 SNPs_all VCF.$ref_genome.vcf $ref_genome | |
endif | |
echo "Finished finding SNPs" | |
date | |
# You can delete this Directory if everything works, but it's useful for debugging in case the run fails | |
rm -r TemporaryFilesToDelete | |
mkdir TemporaryFilesToDelete | |
mv -f Dir.* TemporaryFilesToDelete/. | |
if (-e cmds_mummer) then | |
mv -f cmds_mummer TemporaryFilesToDelete/. | |
mv -f cmds_parse_mummer TemporaryFilesToDelete/. | |
endif | |
mv -f *.mers TemporaryFilesToDelete/. | |
mv -f Jelly.* TemporaryFilesToDelete/. | |
mv -f SNP_loci.*.mers.fasta TemporaryFilesToDelete/. | |
mv -f kmers* TemporaryFilesToDelete/. | |
mv -f fsplit* TemporaryFilesToDelete/. | |
mv -f all_SNPs_unsorted TemporaryFilesToDelete/. | |
mv -f all_SNPs_sorted* TemporaryFilesToDelete/. | |
mv -f mer_list TemporaryFilesToDelete/. | |
mv -f *.mers.SNPs_all TemporaryFilesToDelete/. | |
##probes_from_SNPs_all_kmers $probe_prefix_label | |
## Create a SNP matrix and fasta, for inputting to PHYLIP, FastTreeMP or other tools like SplitsTree | |
$kSNP/SNPs_all_2_fasta_matrix3 SNPs_all SNPs_all_matrix.fasta SNPs_all_matrix | |
printf "parsimony\n" >! tree_list1 | |
printf "parsimony\n" >! tree_list2 | |
############### Make tree using SNP matrix | |
echo "Building parsimony tree" | |
# Build parsimony tree | |
$kSNP/parsimonator -s SNPs_all_matrix -n SNPs_all -N 100 -p 1234 | |
# get all the best scoring trees | |
set best_parsimony_tree_score=`grep "Parsimony tree" RAxML_info.SNPs_all | sort -k6 -n | head -1 | awk '{print $6}'` | |
set best_parsimony_trees=`grep "Parsimony tree" RAxML_info.SNPs_all | awk -v score=$best_parsimony_tree_score '$6==score {print $14}'` | |
set Num_best_parsimony_trees=`grep "Parsimony tree" RAxML_info.SNPs_all | awk -v score=$best_parsimony_tree_score '$6==score {print $14}' | wc -l | awk '{print $1}'` | |
printf "Number of most parsimonious trees from SNPs_all: $Num_best_parsimony_trees\n" | |
printf "Score of those trees: $best_parsimony_tree_score\n" | |
cat $best_parsimony_trees >! intree | |
# Get majority consensus tree | |
rm outfile outtree | |
#PHYLIP consense was the only tool i found that forced resolution of every branch. FastTree to give it branch lengths will crash if some notes have splits to >2 children. But you need to modify seq.h and phylip.h before compiling consense to allow longer names so they don't get truncated | |
echo "Y\n" | $kSNP/consense | |
# Give it branch lengths, optimized for the consensus parsimony tree. | |
$kSNP/force_binary_tree outtree outtree.resolved | |
$kSNP/FastTreeMP -nt -pseudo -nome -mllen -gamma -gtr -intree outtree.resolved SNPs_all_matrix.fasta >! tree.parsimony.tre | |
mv RAxML* TemporaryFilesToDelete/. | |
## Build parsimony tree from SNPs_in_majority"$min_fraction_with_locus" | |
if ($?min_fraction_with_locus) then | |
printf "Getting SNPs_in_majority$min_fraction_with_locus and building tree\n" | |
$kSNP/core_SNPs3 SNPs_all fileName2genomeName $min_fraction_with_locus | |
$kSNP/SNPs_all_2_fasta_matrix3 SNPs_in_majority"$min_fraction_with_locus" SNPs_in_majority"$min_fraction_with_locus"_matrix.fasta SNPs_in_majority"$min_fraction_with_locus"_matrix | |
# Build parsimony tree | |
$kSNP/parsimonator -s SNPs_in_majority"$min_fraction_with_locus"_matrix -n SNPs_majority"$min_fraction_with_locus" -N 100 -p 1234 | |
# get all the best scoring trees | |
set best_parsimony_tree_score=`grep "Parsimony tree" RAxML_info.SNPs_majority"$min_fraction_with_locus" | sort -k6 -n | head -1 | awk '{print $6}'` | |
set best_parsimony_trees=`grep "Parsimony tree" RAxML_info.SNPs_majority"$min_fraction_with_locus" | awk -v score=$best_parsimony_tree_score '$6==score {print $14}'` | |
set Num_best_parsimony_trees=`grep "Parsimony tree" RAxML_info.SNPs_majority"$min_fraction_with_locus" | awk -v score=$best_parsimony_tree_score '$6==score {print $14}' | wc -l | awk '{print $1}'` | |
printf "Number of most parsimonious trees for SNPs_in_majority$min_fraction_with_locus : $Num_best_parsimony_trees\n" | |
printf "Score of those trees: $best_parsimony_tree_score\n" | |
cat $best_parsimony_trees >! intree | |
# Get majority consensus tree | |
rm outfile outtree | |
#Find consensus parsimony tree | |
echo "Y\n" | $kSNP/consense | |
# Give it branch lengths, optimized for the consensus parsimony tree. | |
$kSNP/force_binary_tree outtree outtree.resolved | |
$kSNP/FastTreeMP -nt -pseudo -nome -mllen -gamma -gtr -intree outtree.resolved SNPs_in_majority"$min_fraction_with_locus"_matrix.fasta >! tree.majority"$min_fraction_with_locus".tre | |
mv RAxML* TemporaryFilesToDelete/. | |
# Uncomment the following line to build ML majority tree, and write over the parsimony majority tree just built | |
#$kSNP/FastTreeMP -nt -pseudo -gamma -gtr SNPs_in_majority"$min_fraction_with_locus"_matrix.fasta >! tree.majority"$min_fraction_with_locus".tre | |
foreach t ( majority"$min_fraction_with_locus" ) | |
$kSNP/label_tree_nodes3 tree.$t.tre > ! tree_nodeLabel.$t.tre | |
$kSNP/tree_nodes3 tree_nodeLabel."$t".tre nodes.$t | |
if (-s tree_nodeLabel.$t.tre ) then | |
echo "Placing SNPs on nodes $t tree" | |
$kSNP/SNPs2nodes-new3 SNPs_in_majority"$min_fraction_with_locus" nodes.$t.perlhash tree_nodeLabel.$t.tre Node_SNP_counts.$t | |
if (-e COUNT_Homoplastic_SNPs) then | |
mv COUNT_Homoplastic_SNPs COUNT_Homoplastic_SNPs.$t | |
endif | |
if (-e ClusterInfo) then | |
mv ClusterInfo ClusterInfo.$t | |
endif | |
if (-e Homoplasy_groups) then | |
mv Homoplasy_groups Homoplasy_groups.$t | |
endif | |
date | |
echo "Finished placing SNPs on nodes $t tree" | |
printf "name_on_tree\tSNP_counts\n" >! tip_SNP_counts.$t | |
grep "node: " Node_SNP_counts.$t | grep -w "NumberTargets: 1" | awk '{print $2 "\011" $6}' >> tip_SNP_counts.$t | |
if (-s tree_nodeLabel.$t.tre.rerooted) then | |
rm -f tree_nodeLabel.$t.tre | |
mv -f tree_nodeLabel.$t.tre.rerooted tree_nodeLabel.$t.tre | |
endif | |
#rm_node_names_from_tree tree_nodeLabel.$t.tre tree.$t.tre # don't overwrite tree.$t.tre anymore since we want the support values in original file. | |
$kSNP/labelTree_AlleleCount-new3 tree_nodeLabel.$t.tre Node_SNP_counts.$t tree_tipAlleleCounts.$t.tre tree_AlleleCounts.$t.tre 0 | |
$kSNP/labelTree_AlleleCount-new3 tree_nodeLabel.$t.tre Node_SNP_counts.$t tree_tipAlleleCounts.$t.NodeLabel.tre tree_AlleleCounts.$t.NodeLabel.tre 1 | |
endif | |
end | |
endif | |
##Building parsimony tree from only the core SNPs | |
if ($?core) then | |
printf "Getting core SNPs" | |
if (! $?min_fraction_with_locus) then | |
$kSNP/core_SNPs3 SNPs_all fileName2genomeName 0.5 | |
endif | |
$kSNP/SNPs_all_2_fasta_matrix3 core_SNPs core_SNPs_matrix.fasta core_SNPs_matrix | |
# Build parsimony tree | |
$kSNP/parsimonator -s core_SNPs_matrix -n SNPs_core -N 100 -p 1234 | |
# get all the best scoring trees | |
set best_parsimony_tree_score=`grep "Parsimony tree" RAxML_info.SNPs_core | sort -k6 -n | head -1 | awk '{print $6}'` | |
set best_parsimony_trees=`grep "Parsimony tree" RAxML_info.SNPs_core | awk -v score=$best_parsimony_tree_score '$6==score {print $14}'` | |
set Num_best_parsimony_trees=`grep "Parsimony tree" RAxML_info.SNPs_core | awk -v score=$best_parsimony_tree_score '$6==score {print $14}' | wc -l | awk '{print $1}'` | |
printf "Number of most parsimonious trees for SNPs_core : $Num_best_parsimony_trees\n" | |
printf "Score of those trees: $best_parsimony_tree_score\n" | |
cat $best_parsimony_trees >! intree | |
# Get majority consensus tree | |
rm outfile outtree | |
#Find consensus parsimony tree | |
echo "Y\n" | $kSNP/consense | |
# Give it branch lengths, optimized for the consensus parsimony tree. | |
$kSNP/force_binary_tree outtree outtree.resolved | |
$kSNP/FastTreeMP -nt -pseudo -nome -mllen -gamma -gtr -intree outtree.resolved core_SNPs_matrix.fasta >! tree.core.tre | |
mv RAxML* TemporaryFilesToDelete/. | |
# Uncomment the following line to build ML core tree, and write over the parsimony core tree just built | |
# $kSNP/FastTreeMP -nt -gamma -gtr core_SNPs_matrix.fasta >! tree.core.tre | |
if (-s core_SNPs) then | |
foreach t ( core ) | |
$kSNP/label_tree_nodes3 tree.$t.tre > ! tree_nodeLabel.$t.tre | |
$kSNP/tree_nodes3 tree_nodeLabel."$t".tre nodes.$t | |
if (-s tree_nodeLabel.$t.tre ) then | |
echo "Placing SNPs on nodes $t tree" | |
$kSNP/SNPs2nodes-new3 core_SNPs nodes.$t.perlhash tree_nodeLabel.$t.tre Node_SNP_counts.$t | |
if (-e COUNT_Homoplastic_SNPs) then | |
mv COUNT_Homoplastic_SNPs COUNT_Homoplastic_SNPs.$t | |
endif | |
if (-e ClusterInfo) then | |
mv ClusterInfo ClusterInfo.$t | |
endif | |
if (-e Homoplasy_groups) then | |
mv Homoplasy_groups Homoplasy_groups.$t | |
endif | |
date | |
echo "Finished placing SNPs on nodes $t tree" | |
echo "" | |
printf "name_on_tree\tSNP_counts\n" >! tip_SNP_counts.$t | |
grep "node: " Node_SNP_counts.$t | grep -w "NumberTargets: 1" | awk '{print $2 "\011" $6}' >> tip_SNP_counts.$t | |
if (-s tree_nodeLabel.$t.tre.rerooted) then | |
rm -f tree_nodeLabel.$t.tre | |
mv -f tree_nodeLabel.$t.tre.rerooted tree_nodeLabel.$t.tre | |
endif | |
#rm_node_names_from_tree tree_nodeLabel.$t.tre tree.$t.tre # don't overwrite tree.$t.tre anymore since we want the support values in original file. | |
$kSNP/labelTree_AlleleCount-new3 tree_nodeLabel.$t.tre Node_SNP_counts.$t tree_tipAlleleCounts.$t.tre tree_AlleleCounts.$t.tre 0 | |
$kSNP/labelTree_AlleleCount-new3 tree_nodeLabel.$t.tre Node_SNP_counts.$t tree_tipAlleleCounts.$t.NodeLabel.tre tree_AlleleCounts.$t.NodeLabel.tre 1 | |
endif | |
end | |
endif | |
endif | |
## Building ML FastTree tree from all SNPs | |
if ($?ML) then | |
$kSNP/FastTreeMP -nt -pseudo -gamma -gtr SNPs_all_matrix.fasta >! tree.ML.tre | |
printf "ML\n" >> tree_list1 | |
printf "ML\n" >> tree_list2 | |
endif | |
if ( $?nj) then | |
echo "Building NJ tree" | |
date | |
# NOTE: This next line can take a long time if there are million+ SNP loci and 100+ genomes. SNP_matrix2dist_matrix does loops, so it's slow, should be parallelized. Probably should try the PHYLIP program, although scores might be different since i count them as somewhat closer if they share a locus but not the allele than if they don't even share the locus. But since NJ SNP trees are not accurate anyway, i'm not inclined to spend anymore time since no one should use this option. | |
$kSNP/SNP_matrix2dist_matrix3 SNPs_all_matrix >! NJ.dist.matrix | |
$kSNP/distance_tree3 >! tree.NJ.tre | |
echo "Finished building NJ tree" | |
printf "NJ\n" >> tree_list1 | |
printf "NJ\n" >> tree_list2 | |
date | |
endif | |
####################### | |
$kSNP/find_unresolved_clusters3 tree.parsimony.tre >! unresolved_clusters | |
date | |
echo "Finding nodes" | |
foreach t ( `cat tree_list1` ) | |
if (-s tree."$t".tre) then | |
$kSNP/label_tree_nodes3 tree.$t.tre > ! tree_nodeLabel.$t.tre | |
$kSNP/tree_nodes3 tree_nodeLabel."$t".tre nodes.$t | |
echo "Placing SNPs on nodes $t tree" | |
$kSNP/SNPs2nodes-new3 SNPs_all nodes.$t.perlhash tree_nodeLabel.$t.tre Node_SNP_counts.$t | |
if (-e COUNT_Homoplastic_SNPs) then | |
mv COUNT_Homoplastic_SNPs COUNT_Homoplastic_SNPs.$t | |
endif | |
if (-e ClusterInfo) then | |
mv ClusterInfo ClusterInfo.$t | |
endif | |
if (-e Homoplasy_groups) then | |
mv Homoplasy_groups Homoplasy_groups.$t | |
endif | |
date | |
echo "Finished placing SNPs on nodes $t tree" | |
echo "" | |
endif | |
end | |
# Relabel trees with SNP counts at nodes | |
foreach t ( `cat tree_list1` ) | |
if (-s tree."$t".tre) then | |
printf "name_on_tree\tSNP_counts\n" >! tip_SNP_counts.$t | |
grep "node: " Node_SNP_counts.$t | grep -w "NumberTargets: 1" | awk '{print $2 "\011" $6}' >> tip_SNP_counts.$t | |
if (-s tree_nodeLabel.$t.tre.rerooted) then | |
rm -f tree_nodeLabel.$t.tre | |
mv -f tree_nodeLabel.$t.tre.rerooted tree_nodeLabel.$t.tre | |
endif | |
#rm_node_names_from_tree tree_nodeLabel.$t.tre tree.$t.tre # don't overwrite tree.$t.tre anymore since we want the support values in original file. | |
$kSNP/labelTree_AlleleCount-new3 tree_nodeLabel.$t.tre Node_SNP_counts.$t tree_tipAlleleCounts.$t.tre tree_AlleleCounts.$t.tre 0 | |
$kSNP/labelTree_AlleleCount-new3 tree_nodeLabel.$t.tre Node_SNP_counts.$t tree_tipAlleleCounts.$t.NodeLabel.tre tree_AlleleCounts.$t.NodeLabel.tre 1 | |
endif | |
end | |
mv -f nodes.* TemporaryFilesToDelete/. | |
mv -f tree_tipAlleleCounts.*.NodeLabel.tre TemporaryFilesToDelete/. | |
mv -f tree_nodeLabel.* TemporaryFilesToDelete/. | |
######## | |
# find proteins where SNPs land, codons, amino acids, and identify nonsynonymous SNPs | |
echo "Annotating SNPs." | |
date | |
# Only get genbank file and annoate if there is positional information for some genomes, ie. annotate_list is not empty | |
if (-s annotate_list) then | |
# Get whole genome annotations from genbank, unfortunately you have to get the whole genbank file with sequence data, since the much smaller feature table does not have mature peptides making viral annotation useless with polyproteins only. | |
set count=0 | |
printf "" >! headers.annotate_list | |
foreach genome (`cat annotate_list`) | |
set file_check=`grep -w $genome fasta_list | wc -l` | |
if ($file_check > 0 ) then | |
set file=`grep -w $genome fasta_list | awk -F'\011' '{print $1}'` | |
printf "$file\n" | |
$kSNP/get_genbank_file3 "$file" genbank_from_NCBI.gbk.$count | |
fgrep ">" "$file" | sed -e "s/^>/>$genome /" >> headers.annotate_list | |
@ count ++ | |
endif | |
end | |
cat genbank_from_NCBI.gbk.* | grep -v BioProject >! genbank_from_NCBI.gbk | |
rm genbank_from_NCBI.gbk.* | |
if (-e "$genbankFile" ) then | |
$kSNP/annotate_SNPs_from_genbankFiles3 -all $all_annotations $genbankFile | |
else | |
$kSNP/annotate_SNPs_from_genbankFiles3 -all $all_annotations | |
endif | |
printf "Num_NotAnnotatedRegion\tAnnotatedNotProtein\tNum_NonSynon\tNum_Synon\tNS/S\tNSfractionOfAnnotated\tNumLoci\tNum_InAnnotatedGenomes\tNum_NotInAnnotatedGenome\n" >! Annotation_summary | |
set i=SNP_annotations | |
set num_notInAnnotatedGenome=`grep NotInAnnotatedGenome $i | awk ' {print $1}' | sort -u | wc -l | awk '{print $1}'` | |
set num_UnAnnRegion=`grep UnannotatedRegion $i | awk ' {print $1}' | sort -u | wc -l | awk '{print $1}'` | |
set num_AnnNotProtein=`grep NotProteinCoding $i | awk ' {print $1}' | sort -u | wc -l | awk '{print $1}'` | |
set NS_total=`grep -v LocusNum $i | awk ' $3>0 {print $1}' | sort -u | wc -l | awk '{print $1}'` | |
set Num_loci=`grep -v LocusNum $i | awk '{print $1}' | sort -u | wc -l | awk '{print $1}'` | |
set Num_loci_in_annotated=`grep -v LocusNum $i | grep -v NotInAnnotatedGenome | awk '{print $1}' | sort -u | wc -l | awk '{print $1}'` | |
set S_total=`perl -e "print ($Num_loci_in_annotated-$NS_total)"` | |
if ($S_total > 0) then | |
set NS_Sratio=`perl -e "print $NS_total/$S_total"` | |
else | |
set NS_Sratio="inf" | |
endif | |
if ($Num_loci_in_annotated > 0) then | |
set NSfraction_overall=`perl -e "print $NS_total/$Num_loci_in_annotated"` | |
else | |
set NSfraction_overall="inf" | |
endif | |
printf "$num_UnAnnRegion\t$num_AnnNotProtein\t$NS_total\t$S_total\t$NS_Sratio\t$NSfraction_overall\t$Num_loci\t$Num_loci_in_annotated\t$num_notInAnnotatedGenome\n" >> Annotation_summary | |
$kSNP/parse_protein_annotation_counts3 SNP_annotations >! Protein_Annotation_counts | |
echo "Finished SNP annotation." | |
endif | |
echo "Finished running kSNP" | |
date | |
set endseconds=`date +%s` | |
set elapsedTime=`perl -e "print (($endseconds-$startseconds)/60/60)"` | |
echo "Elapsed time for kSNP in hours: $elapsedTime" | |
mv cmds* TemporaryFilesToDelete/. | |
mv tree_list1 TemporaryFilesToDelete/. | |
mv tree_list2 TemporaryFilesToDelete/. | |
mv -f fileName2genomeName TemporaryFilesToDelete/. | |
rm intree outtree outfile | |
if (-s SNPs_all && -s tree.parsimony.tre && -s tree_AlleleCounts.parsimony.tre && -s unresolved_clusters && -s COUNT_SNPs && $DEBUG<1) then | |
rm -r TemporaryFilesToDelete | |
endif | |
exit | |
########################################################################################################## | |
########################################################################################################## | |
# HRE finder is not updated to work with kSNP3. Use kSNP2 if you want to use HREfinder. | |
# In case you want to run HREFinder at http://sourceforge.net/projects/hrefinder/ | |
# set your path to the hreFinder code "set hre=/path/to/hreFinder" | |
# and comment out the exit line above. | |
# YOU MUST HAVE ALL THE GENOMES IN THE -p annotate_list LIST. FOR HREFINDER YOU NEED POSITIONAL | |
# INFORMATION FOR ALL OF THEM, EVEN THE DRAFT GENOMES THAT ARE ASSEMBLED INTO A FEW LARGE CONTIGS. | |
# If some draft genomes are in alot of contigs, it is recommended that | |
# you remove those and rerun kSNP before attempting hreFinder. | |
# Don't run hreFinder with genomes that are raw unassembled reads. | |
###### Run hreFinder to predict series of SNPs likely to have been involved in homologous recombination events | |
set hre=/usr/gapps/kpath/hreFinder | |
if (-s SNPs_all) then | |
# Set reference genome for vcf file to the be first finished genome, if this is empty, then set it to be the first genome in the input fasta file. | |
if (-s annotate_list) then | |
set ref_genome=`head -1 annotate_list` | |
endif | |
if !($?ref_genome) then | |
set ref_genome=`head -1 fileName2genomeName | awk '{print $2}'` | |
endif | |
foreach tree (`cat tree_list2`) | |
mkdir HRE.$tree | |
cd HRE.$tree | |
$hre/run_config.py ../tree.$tree.tre ../fastainput ../SNPs_all $ref_genome | |
echo "" | |
echo $tree | |
echo "Number of SNPs involved in HRE events:" | |
awk '$1!="" {print $1}' hreSNPs | sort -u | wc -l | |
echo "Number of HRE events:" | |
grep -v HRE_events hre_from_to_c | awk ' total=total+$5 {} END {print total}' | |
echo "Number of HRE events from outside tree:" | |
grep -v HRE_events hre_from_to_c | grep outside | awk ' total=total+$5 {} END {print total}' | |
cd .. | |
end | |
endif | |
exit | |
# Set up standing db of genbank files so you don't have to go online to annotate SNPs | |
mkdir GenbankFiles | |
cd GenbankFiles | |
foreach domain ( Viruses Bacteria ) | |
mkdir $domain | |
cd $domain | |
foreach type (gbk) | |
mkdir Temp | |
cd Temp | |
wget "ftp://ftp.ncbi.nih.gov/genomes/$domain/all.$type.tar.gz" | |
tar -xvzf all.$type.tar.gz | |
rm *.tar.gz | |
mv */* .. | |
rm -r * | |
cd .. | |
rm -r Temp | |
end | |
cd .. | |
end | |
# get gbk files for plasmids | |
foreach domain ( Plasmids) | |
mkdir $domain | |
cd $domain | |
foreach type (gbk) | |
wget "ftp://ftp.ncbi.nih.gov/genomes/$domain/plasmids.all.$type.tar.gz" | |
tar -xvzf plasmids.all.$type.tar.gz | |
mv am/ftp-genomes/Plasmids/$type/* . | |
rm -r am | |
rm *.tar.gz | |
end | |
cd .. | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment