Last active
October 11, 2023 10:11
-
-
Save epaule/abb1a578b37f79d867b9b412b0e888fb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
GitHub Repositories | |
======================= | |
contamination files: https://github.com/epaule/btk_sequences_to_remove | |
blast scripts: https://github.com/Aquatic-Symbiosis-Genomics-Project/BLAST-scripts | |
decon blast | |
=========== | |
bash decon_blastBTK.sh <FASTA-file> <CSV file with ticks> <output directory> | |
Useful one-liners: | |
======================= | |
# create stubs from a BTK CSV file | |
perl -nE 's/"//g;chomp;@F=split(",");say "REMOVE\t$F[-1] #$F[-3]"' *.csv |sort -k3 | |
# after blasting, grep all Schisto hits and stub them into a remove | |
head -3 *out |perl -anE 'next if /==>|^\n|No hits/; say "$F[0]\t$F[10]\t$F[13] $F[14]"'|grep Schistosoma |perl -anE 'say "REMOVE\t$F[0]"' |sort -u | |
# categories from contamination files | |
cat *.contamination | grep \# |grep -v query |sort -u |perl -nE 's/\#//;chomp;push @G,"$_";END{say join(", ",@G)}' | |
# duplicated lines | |
perl -nE 'next if /^\n|\#/;print if $h{$_};$h{$_}++' daMisOron1.20220213.contamination | |
# remove duplicated decon lines | |
perl -i -ne 'if(/REMOVE\s+(\S+)/){print unless $h{$1};$h{$1}=1}else{print}' icElmAene2.20221111.haplotigs.contamination_ | |
#fire off NCBI-gx | |
bsub -o fcs_genome.log -M 400000 -n 36 -R'select[mem>400000, tmp>500G] rusage[mem=400000, tmp=600G]' bash /lustre/scratch123/tol/teams/grit/mh6/ncbi-decon/singularity_test/gx_pipeline/gx_map_wrapper.bash -o . -f fasta1.gz -f fasta2.gz -t <taxid> | |
# run the mito blast outside of the Jira-pipeline (output will be in the local work-driectory and not ): | |
$PROJECT_DIR/scripts/mito_contamination_screen.sh ./NC_040858.1.fa.gz ./bHalAlb1.20221031.fa.gz | |
#stubs from FCS | |
grep EXC drAceCamp1.20221031.fa.66205.fcs_gx_report.txt |perl -F'\t' -ane 'print if $F[6] >= 20' |sort -k6 | |
#create curation.bed lines from the gzipped fasta and the BTK CSV (need to rewrite this in crystal, so it universally works) | |
ruby -rzlib -rbio -e 's=Hash.new;Bio::FlatFile.auto(Zlib::GzipReader.open(ARGV[0])){|file|file.each{|e|s[e.entry_id]=e.seq.length}};File.open(ARGV[1]){|f|f.each_line{|line|line.chomp!;c=line.split(",");puts "#{c[1]}\t0\t#{s[c[1]]}\tREMOVE" if c[0] == "true"}}' htOpiVive1.20231009.fa.gz blast.me |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment