This step is not related to the niche breadth analysis; we just use those that are found everywhere.
# Keep only the cluster components listed in `clstrs_comp_eu_ubi` and write
# their cluster names (one per line, no header) to eu_core_comps.tsv.
super_cl <- read_tsv(
  "/Users/ufo/Downloads/all_cluster_components.tsv",
  col_names = TRUE,
  trim_ws = TRUE
) %>%
  filter(component %in% clstrs_comp_eu_ubi) %>%
  select(clstr_name) %>%
  # The output file is passed positionally: the `path =` argument name is
  # deprecated in readr >= 1.4 (renamed to `file`), whereas the positional
  # form is accepted by both old and new readr versions.
  write_tsv("~/Downloads/eu_core_comps.tsv", col_names = FALSE)
# NOTE(review): the shell commands below read eu_core_comps.tsv from the
# current directory; presumably the file is copied there first -- confirm.
# Retrieve the entries named in eu_core_comps.tsv from the ffindex database
# and store them in eu_core_comps.fasta.
~/opt/ffindex_mg/bin/ffindex_get marine_hmp_db_03112017_eu_cons.ffdata marine_hmp_db_03112017_eu_cons.ffindex $(cat eu_core_comps.tsv) > eu_core_comps.fasta
# Search the sequences against the AntiFam HMMs; per-domain hits go to
# eu_spur.tblout and the full report to eu_spur.log.
hmmsearch --cpu 32 -Z 441329 --domtblout eu_spur.tblout -o eu_spur.log AntiFam.hmm eu_core_comps.fasta
And we parse the results, keeping hits with an e-value < 1e-5 and a coverage >= 0.6:
# Parse the hmmsearch domain table (eu_spur.tblout):
#   1. drop the comment lines and keep, tab-separated, the domtblout fields
#      4, 6, 1, 3, 13, 16, 17, 18, 19 (query name, query length, target name,
#      target length, i-Evalue, hmm from/to, ali from/to);
#   2. per target, resolve overlapping consecutive hits, keeping the one with
#      the lower e-value;
#   3. keep hits with an e-value below E and append the HMM coverage
#      (hmm span / query length) as the last column;
#   4. keep only rows whose coverage is >= 0.6.
# The tab separator is produced by awk itself (OFS) instead of a follow-up
# sed 's/ /\t/g', because "\t" in a sed replacement is a GNU-only escape.
grep -v '^#' eu_spur.tblout |
  awk -v OFS='\t' '{print $4, $6, $1, $3, $13, $16, $17, $18, $19}' |
  perl -e '
    # Group the hits by target name (third column); hits whose two last
    # coordinates are equal are skipped (they would give a zero length).
    while (<>) {
      chomp;
      @f = split;
      next if $f[-1] == $f[-2];
      push(@{$by_target{$f[2]}}, $_);
    }
    foreach (sort keys %by_target) {
      @hits = @{$by_target{$_}};
      for ($i = 0; $i < $#hits; $i++) {
        @cur = split(/\t/, $hits[$i]);
        @nxt = split(/\t/, $hits[$i + 1]);
        $len1 = $cur[-1] - $cur[-2];
        $len2 = $nxt[-1] - $nxt[-2];
        # Overlap between the end of this hit and the start of the next one.
        $len3 = $cur[-1] - $nxt[-2];
        if ($len3 > 0 and ($len3 / $len1 > 0.5 or $len3 / $len2 > 0.5)) {
          # Drop the hit with the higher e-value (fifth column).
          if ($cur[4] < $nxt[4]) {
            splice(@hits, $i + 1, 1);
          } else {
            splice(@hits, $i, 1);
          }
          # Re-examine the same position after the removal.
          $i = $i - 1;
        }
      }
      foreach (@hits) {
        print $_ . "\n";
      }
    }
  ' |
  E=1e-5 perl -e '
    # The original had an if/else on the alignment length (> 80) whose two
    # branches were identical; a single statement gives the same output.
    while (<>) {
      chomp;
      @f = split(/\t/, $_);
      print $_, "\t", ($f[-3] - $f[-4]) / $f[1], "\n" if $f[4] < $ENV{E};
    }
  ' |
  awk '$NF >= 0.6' > eu_ubiq_spurious_sign.tblout
# Unique target names (third column) of the retained hits.
cut -f3 eu_ubiq_spurious_sign.tblout | sort -u > eu_ubiq_spurious_sign_ids.txt
# Write the sequences that are NOT in the id list (include=f).
filterbyname.sh in=eu_core_comps.fasta out=eu_core_comps_no_spr.fasta names=eu_ubiq_spurious_sign_ids.txt include=f ignorejunk
We are going to check which sequences have remote homologies using hhblits against the Uniclust DB.
# Build an ffindex database from the FASTA file without AntiFam hits.
~/opt/ffindex_mg/bin/ffindex_from_fasta -s eu_core_comps_no_spr.ffdata eu_core_comps_no_spr.ffindex eu_core_comps_no_spr.fasta
# Run hhblits.sh on every entry of the database, in parallel over MPI.
mpirun -np "${NSLOTS}" /home/afernand/opt/ffindex_mg/bin/ffindex_apply_mpi \
  /bioinf/home/afernand/SANDBOX/jackhmmer/eu_core_comps_no_spr.ff{data,index} -- hhblits.sh
# Collect the per-entry results found in results/ into one ffindex database.
~/opt/ffindex_mg/bin/ffindex_build eu_core_comps_r.ffdata eu_core_comps_r.ffindex results/
# Parse every result with hh_parser.sh; pv/wc only report line counts.
${OPENMPI_HOME}/bin/mpirun -np 16 ~/opt/ffindex_mg/bin/ffindex_apply_mpi -d eu_core_comps_parsed.ffdata -i eu_core_comps_parsed.ffindex eu_core_comps_r.ff{data,index} -- ./hh_parser.sh | pv -l | wc -l
# Strip the NUL bytes from the ffdata file and list the unique ids (first
# column). tr is used because the "\x0" escape in sed is GNU-only.
tr -d '\000' < eu_core_comps_parsed.ffdata | cut -f1 | sort -u > eu_core_comps_hom.ids
# Write the sequences that are NOT in the id list (include=f).
filterbyname.sh in=eu_core_comps_no_spr.fasta out=eu_core_comps_no_spr_hom.fasta names=eu_core_comps_hom.ids include=f
# Keep the first line per id and count those mentioning Uncharacterized or
# Hypothetical (case-insensitive). grep -E replaces the GNU-only "\|".
tr -d '\000' < eu_core_comps_parsed.ffdata | awk '!seen[$1]++' | grep -E -i -c 'Uncharacterized|Hypothetical'
We will use Kaiju to try to be reference-free and see if we can classify some of our clusters. We are using greedy mode to have a better sensitivity and precision.
# Classify the remaining sequences with Kaiju in greedy mode (-a greedy -e 5)
# against kaiju_db_nr_euk.fmi; the classification is written to k.out.
# NOTE(review): -p is presumably the protein-input switch -- confirm against
# the Kaiju version in ./bin.
./bin/kaiju -z 32 -t nodes.dmp -f kaiju_db_nr_euk.fmi -i ../eu_core_comps_no_spr_hom.fasta -o k.out -p -a greedy -e 5
# Count how many times each value of the first column of k.out occurs.
cut -f1 k.out | sort | uniq -c
# Add taxon names to the Kaiju output and write the result to k.rep.
./bin/addTaxonNames -p -t nodes.dmp -n names.dmp -i k.out -o k.rep
Initial: 6587
Antifam: 250
HHblits: 4823 (3811 have best hit as Hypothetical/Uncharacterized)
Kaiju: 81 classified
1433 with no traces in the DBs