Skip to content

Instantly share code, notes, and snippets.

@genomewalker
Last active May 11, 2018 06:45
Show Gist options
  • Save genomewalker/753cdb05822d77276621a475dc1b5830 to your computer and use it in GitHub Desktop.
Save genomewalker/753cdb05822d77276621a475dc1b5830 to your computer and use it in GitHub Desktop.

18S magicblast

dada2_input filtered denoised merged table no_chimeras perc_reads_survived
ERR1018543 6525 4002 3727 3629 3629 3076 47.1
ERR1018546 2421 1513 1276 1229 1229 1095 45.2

Total counts: 4,171

Kingdom n
Eukaryota 322

16S magicblast

dada2_input filtered denoised merged table no_chimeras perc_reads_survived
ERR1018543 30135 21594 20993 15563 15563 12324 40.9
ERR1018546 33381 25166 24647 19719 19719 15221 45.6

Total counts: 27,545

Kingdom n
Archaea 14
Bacteria 469
Eukaryota 6
NA 6

18S ssu-align

dada2_input filtered denoised merged table no_chimeras perc_reads_survived
ERR1018543 10164 5743 5457 5263 5263 4701 46.3
ERR1018546 4248 2420 2162 2098 2098 1901 44.8

Total counts: 6,602

Kingdom n
Eukaryota 388

16S ssu-align

dada2_input filtered denoised merged table no_chimeras perc_reads_survived
ERR1018543 26496 20002 19443 15568 15568 12328 46.5
ERR1018546 31554 24338 23812 19606 19606 15075 47.8

Total counts: 27,403

Kingdom n
Archaea 14
Bacteria 465
Eukaryota 6
NA 8
# Reformat fastq to fasta
reformat.sh in=ERR1018543_1_trimmed.fq.gz in2=ERR1018543_2_trimmed.fq.gz out=ERR1018543_1_trimmed.fasta out2=ERR1018543_2_trimmed.fasta
reformat.sh in=ERR1018546_1_trimmed.fq.gz in2=ERR1018546_2_trimmed.fq.gz out=ERR1018546_1_trimmed.fasta out2=ERR1018546_2_trimmed.fasta
# Prepare ssu-align to be run in parallel
ssu-prep -f -x ERR1018543_1_trimmed.fasta ssu-align_ERR1018543_1 32
ssu-prep -f -x ERR1018543_2_trimmed.fasta ssu-align_ERR1018543_2 32
ssu-prep -f -x ERR1018546_1_trimmed.fasta ssu-align_ERR1018546_1 32
ssu-prep -f -x ERR1018546_2_trimmed.fasta ssu-align_ERR1018546_2 32
# Run ssu-align
./ssu-align_ERR1018543_1.ssu-align.sh
./ssu-align_ERR1018543_2.ssu-align.sh
./ssu-align_ERR1018546_1.ssu-align.sh
./ssu-align_ERR1018546_2.ssu-align.sh
# Get read ids for the 16S and 18S reads identified by ssu-align
PAIR=ERR1018543_1; find ssu-align_"${PAIR}" -regex '.*\(archaea.fa\|bacteria.fa\)$' -exec grep '>' {} \; | tr -d '>' | cut -f 1 -d ' ' > "${PAIR}"_16S_ssu-align.ids
PAIR=ERR1018543_2; find ssu-align_"${PAIR}" -regex '.*\(archaea.fa\|bacteria.fa\)$' -exec grep '>' {} \; | tr -d '>' | cut -f 1 -d ' ' > "${PAIR}"_16S_ssu-align.ids
PAIR=ERR1018546_1; find ssu-align_"${PAIR}" -regex '.*\(archaea.fa\|bacteria.fa\)$' -exec grep '>' {} \; | tr -d '>' | cut -f 1 -d ' ' > "${PAIR}"_16S_ssu-align.ids
PAIR=ERR1018546_2; find ssu-align_"${PAIR}" -regex '.*\(archaea.fa\|bacteria.fa\)$' -exec grep '>' {} \; | tr -d '>' | cut -f 1 -d ' ' > "${PAIR}"_16S_ssu-align.ids
PAIR=ERR1018543_1; find ssu-align_"${PAIR}" -regex '.*\(eukarya.fa\)$' -exec grep '>' {} \; | tr -d '>' | cut -f 1 -d ' ' > "${PAIR}"_18S_ssu-align.ids
PAIR=ERR1018543_2; find ssu-align_"${PAIR}" -regex '.*\(eukarya.fa\)$' -exec grep '>' {} \; | tr -d '>' | cut -f 1 -d ' ' > "${PAIR}"_18S_ssu-align.ids
PAIR=ERR1018546_1; find ssu-align_"${PAIR}" -regex '.*\(eukarya.fa\)$' -exec grep '>' {} \; | tr -d '>' | cut -f 1 -d ' ' > "${PAIR}"_18S_ssu-align.ids
PAIR=ERR1018546_2; find ssu-align_"${PAIR}" -regex '.*\(eukarya.fa\)$' -exec grep '>' {} \; | tr -d '>' | cut -f 1 -d ' ' > "${PAIR}"_18S_ssu-align.ids
# Split reads (we will only use a combination of FWD/REV ids, a more strict approach would be to select only those reads that have been in aligned in FWD/REV)
SSU=16S; SAMPLE=ERR1018543; filterbyname.sh in="${SAMPLE}"_1_trimmed.fq.gz in2="${SAMPLE}"_2_trimmed.fq.gz out="${SSU}"_"${SAMPLE}"_1_trimmed_ssualign.fq.gz out2="${SSU}"_"${SAMPLE}"_2_trimmed_ssualign.fq.gz names=<(cat "${SAMPLE}"_1_18S_ssu-align.ids "${SAMPLE}"_2_18S_ssu-align.ids) include=f
SSU=18S; SAMPLE=ERR1018543; filterbyname.sh in="${SAMPLE}"_1_trimmed.fq.gz in2="${SAMPLE}"_2_trimmed.fq.gz out="${SSU}"_"${SAMPLE}"_1_trimmed_ssualign.fq.gz out2="${SSU}"_"${SAMPLE}"_2_trimmed_ssualign.fq.gz names=<(cat "${SAMPLE}"_1_18S_ssu-align.ids "${SAMPLE}"_2_18S_ssu-align.ids) include=t
SSU=16S; SAMPLE=ERR1018546; filterbyname.sh in="${SAMPLE}"_1_trimmed.fq.gz in2="${SAMPLE}"_2_trimmed.fq.gz out="${SSU}"_"${SAMPLE}"_1_trimmed_ssualign.fq.gz out2="${SSU}"_"${SAMPLE}"_2_trimmed_ssualign.fq.gz names=<(cat "${SAMPLE}"_1_"${SSU}"_ssu-align.ids "${SAMPLE}"_2_"${SSU}"_ssu-align.ids) include=f
SSU=18S; SAMPLE=ERR1018546; filterbyname.sh in="${SAMPLE}"_1_trimmed.fq.gz in2="${SAMPLE}"_2_trimmed.fq.gz out="${SSU}"_"${SAMPLE}"_1_trimmed_ssualign.fq.gz out2="${SSU}"_"${SAMPLE}"_2_trimmed_ssualign.fq.gz names=<(cat "${SAMPLE}"_1_18S_ssu-align.ids "${SAMPLE}"_2_18S_ssu-align.ids) include=t
# Some stats
seqkit stats 1*S_*trimmed_ss*.fq.gz
file format type num_seqs sum_len min_len avg_len max_len
16S_ERR1018543_1_trimmed_ssualign.fq.gz FASTQ DNA 26,496 7,348,681 255 277.4 296
16S_ERR1018543_2_trimmed_ssualign.fq.gz FASTQ DNA 26,496 7,851,808 250 296.3 305
16S_ERR1018546_1_trimmed_ssualign.fq.gz FASTQ DNA 31,554 8,748,253 250 277.2 296
16S_ERR1018546_2_trimmed_ssualign.fq.gz FASTQ DNA 31,554 9,373,913 263 297.1 305
18S_ERR1018543_1_trimmed_ssualign.fq.gz FASTQ DNA 10,164 2,818,891 276 277.3 296
18S_ERR1018543_2_trimmed_ssualign.fq.gz FASTQ DNA 10,164 3,013,853 285 296.5 305
18S_ERR1018546_1_trimmed_ssualign.fq.gz FASTQ DNA 4,248 1,177,629 276 277.2 296
18S_ERR1018546_2_trimmed_ssualign.fq.gz FASTQ DNA 4,248 1,262,690 263 297.2 305
seqkit stats 1*S_*trimmed.fq.gz
file format type num_seqs sum_len min_len avg_len max_len
16S_ERR1018543_1_trimmed.fq.gz FASTQ DNA 30,135 8,357,879 255 277.3 296
16S_ERR1018543_2_trimmed.fq.gz FASTQ DNA 30,135 8,931,640 250 296.4 305
16S_ERR1018546_1_trimmed.fq.gz FASTQ DNA 33,381 9,254,697 250 277.2 296
16S_ERR1018546_2_trimmed.fq.gz FASTQ DNA 33,381 9,916,913 263 297.1 305
18S_ERR1018543_1_trimmed.fq.gz FASTQ DNA 6,525 1,809,693 276 277.3 294
18S_ERR1018543_2_trimmed.fq.gz FASTQ DNA 6,525 1,934,021 285 296.4 305
18S_ERR1018546_1_trimmed.fq.gz FASTQ DNA 2,421 671,185 276 277.2 291
18S_ERR1018546_2_trimmed.fq.gz FASTQ DNA 2,421 719,690 285 297.3 305
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment