Create TARA subsampled (at nt level) read files
#!/bin/bash
# 1. Read a folder and take all fastq files
# 2. Quality trim and remove short sequences
# 3. Extract ORFs and assign eggNOG orthologous groups with MMseqs2 (Kaiju profiling is currently disabled)
#. "${MODULESHOME}"/init/bash
set -x
set -e
main(){
    trap clean_ok EXIT
    NSLOTS=${SLURM_JOB_CPUS_PER_NODE}
    NSUB=2000000
    ITER=RUN
    NAM=NOM_${ITER}
    IDIR=/data/oerc-microb3/afernandezguerra/tara_nogs/input
    ODIR=/data/oerc-microb3/afernandezguerra/tara_nogs/results/${ITER}/NOM

    if [ -d "${ODIR}" ]; then
        rm -rf "${ODIR}"/"${NAM}"* "${ODIR}"/tmp*
    fi
    mkdir -p "${ODIR}"
    cd "${ODIR}"
    # NAM=$(awk -v L="${SGE_TASK_ID}" 'NR==L' "${FFILE}")

    VSEARCH=vsearch
    BBDUK=bbduk.sh
    ADAP=~/opt/bbmap/resources/tara.fa.gz
    PQUAL=33

    # Original fastq files
    PE1RAW="${IDIR}"/NOM_1.fastq.gz
    PE2RAW="${IDIR}"/NOM_2.fastq.gz
    # Subsampled fastq files
    PE1="${ODIR}"/"${NAM}"_1.sub.fastq
    PE2="${ODIR}"/"${NAM}"_2.sub.fastq
    # Merged and unmerged files
    ME="${ODIR}"/"${NAM}".merged.fastq
    NM1="${ODIR}"/"${NAM}"_1.nm.fastq
    NM2="${ODIR}"/"${NAM}"_2.nm.fastq
    # Quality-trimmed files
    MEQC="${ODIR}"/"${NAM}".merged.qc.fastq.gz
    NM1QC="${ODIR}"/"${NAM}"_1.nm.qc.fastq.gz
    NM2QC="${ODIR}"/"${NAM}"_2.nm.qc.fastq.gz
    PESQC="${ODIR}"/"${NAM}"_2.ms.qc.fastq.gz
    SR="${ODIR}"/"${NAM}".qc.fastq.gz
    FA="${MEQC/.merged.qc.fastq.gz/.merged.fasta}"
    LOG="${ODIR}"/"${NAM}".log

    if [ -f "${LOG}" ]; then rm "${LOG}"; else touch "${LOG}"; fi

    printf "Crunching sample %s using %s threads\n\n" "${NAM}" "${NSLOTS}" | tee "${LOG}"

    printf "Sampling %s reads to estimate the read length and quality encoding..." "${NSUB}" | tee -a "${LOG}"
    printf "\n\n" >> "${LOG}"
    subsample_reads
    check_length
    # guess_quality
    printf "done.\n" | tee -a "${LOG}"
    printf "Read length: %s nt\n\n" "${SLEN}" | tee -a "${LOG}"

    printf "Merging reads..." | tee -a "${LOG}"
    printf "\n\n" >> "${LOG}"
    merge_reads
    printf "done\n\n" | tee -a "${LOG}"

    printf "Quality trimming with BBDUK (Q20) and min length >= 75..." | tee -a "${LOG}"
    printf "\n\n" >> "${LOG}"
    quality_trim_paired
    quality_trim_single
    concat_fastq
    printf "done\n\n" | tee -a "${LOG}"

    # printf "Taxonomic profile with Kaiju..." | tee -a "${LOG}"
    # taxonomy_profiler
    # printf "done.\n\n" | tee -a "${LOG}"

    printf "ORF extraction and eggNOG assignment with MMseqs2..." | tee -a "${LOG}"
    # gene_prediction
    nog_mmseqs
    printf "done.\n\n" | tee -a "${LOG}"

    # printf "Concatenating and dereplicating fasta files..." | tee -a "${LOG}"
    # dereplicate
    # printf "done.\n\n" | tee -a "${LOG}"

    printf "Cleaning files..." | tee -a "${LOG}"
    # clean_ok
    printf "done.\n\n" | tee -a "${LOG}"

    printf "%s status: DONE" "${NAM}" >> "${LOG}"
}
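
# Estimate the mean read length from the first 10,000 subsampled forward reads (40,000 FASTQ lines).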
check_length(){
    SLEN=$(head -n 40000 "${PE1}" | awk 'BEGIN{N=0}NR%4==2{N=N+length($0)}END{printf("%.0f", N/10000)}')
}
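
# Guess the FASTQ quality encoding (ASCII offset) from the subsampled reads with vsearch; currently disabled in main.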
guess_quality(){
    PQUALG=$("${VSEARCH}" --fastq_chars "${PE1}" 2>&1 >/dev/null | grep 'fastq_ascii' | sed -e 's/Guess: //g')
    PQUAL=$(echo "${PQUALG}" | awk '{print $6}')
    #rm tmp.fastq
}
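
# Subsample NSUB reads from each raw file with seqtk, using the same random seed for both mates so pairing is preserved.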
subsample_reads(){
    SEED=$RANDOM
    #reformat.sh in=${PE1RAW} in2=${PE2RAW} samplereadstarget=${NSUB} out=${PE1} out2=${PE2} &>> ${LOG}
    seqtk sample -s"${SEED}" "${PE1RAW}" "${NSUB}" > "${PE1}" &
    seqtk sample -s"${SEED}" "${PE2RAW}" "${NSUB}" > "${PE2}" &
    wait
}
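
# Merge overlapping read pairs with vsearch; pairs that do not merge are written to separate forward/reverse files.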
merge_reads(){
    "${VSEARCH}" --fastq_mergepairs "${PE1}" --reverse "${PE2}" --fastqout "${ME}" --fastqout_notmerged_fwd "${NM1}" --fastqout_notmerged_rev "${NM2}" --threads "${NSLOTS}" --fastq_allowmergestagger --notrunclabels --fastq_qmin 0 --fastq_qmax 41 --fastq_ascii 33 &>> "${LOG}"
    rm "${PE1}" "${PE2}"
}
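
# Adapter- and quality-trim the unmerged pairs with BBDuk (Q20, min length 75); orphaned singletons go to PESQC.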
quality_trim_paired(){
    "${BBDUK}" in="${NM1}" in2="${NM2}" out1="${NM1QC}" out2="${NM2QC}" outs="${PESQC}" qin="${PQUAL}" qtrim=rl trimq=20 ktrim=r k=25 mink=11 hdist=1 tbo tpe maxns=0 threads="${NSLOTS}" ref="${ADAP}" overwrite=true minlen=75 &>> "${LOG}"
    rm "${NM1}" "${NM2}"
}
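
# Adapter- and quality-trim the merged reads with the same BBDuk settings.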
quality_trim_single(){
    "${BBDUK}" in="${ME}" out="${MEQC}" qin="${PQUAL}" qtrim=rl trimq=20 ktrim=r k=25 mink=11 hdist=1 tbo tpe maxns=0 threads="${NSLOTS}" ref="${ADAP}" overwrite=true minlen=75 &>> "${LOG}"
    rm "${ME}"
}
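
# Append the trimmed singletons to the merged QC file (concatenated gzip streams remain a valid gzip file).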
concat_fastq(){
    cat "${PESQC}" >> "${MEQC}"
}
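
# Convert the QC'ed FASTQ to FASTA and predict genes with FragGeneScan; currently disabled in main in favour of nog_mmseqs.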
gene_prediction(){
    reformat.sh in="${MEQC}" out="${FA}"
    # ~/opt/FragGeneScan1.30/run_FragGeneScan.pl -genome=${FA} -out="${NAM}" -complete=0 -train=illumina_5 -thread="${NSLOTS}" &>> "${LOG}"
    # ~/opt/FragGeneScanPlus/FGS+ -s ${FA} -o ${NAM} -w 0 -p ${NSLOTS} -e 0 -d 0 -m 100000 -r ${HOME}/opt/FragGeneScanPlus/train -t illumina_5
}
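
# On EXIT: remove intermediate files, pack the output directory into a tarball and delete the directory.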
clean_ok(){
    rm -rf "${MEQC}" "${NM1QC}" "${NM2QC}" "${PESQC}" "${SR}" "${FA}" "${PE1}" "${PE2}" "${ME}" "${NM1}" "${NM2}" ${TMP} ${ODIR}/${NAM}_DB_aa* ${ODIR}/${NAM}_DB_orf*
    cd ..
    tar cvfz "${NAM}".tar.gz "${ODIR}"
    rm -rf "${ODIR}"
}
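
# Lightweight cleanup (remove only the subsampled FASTQ files); not called in main as written.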
clean_skip(){
    rm "${PE1}" "${PE2}"
}
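
# Build an MMseqs2 database from the QC'ed reads, extract and translate ORFs, and search them against the bactArNOG (eggNOG) database.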
nog_mmseqs(){
    TMP=$(mktemp -d -p "${ODIR}")
    reformat.sh in="${MEQC}" out="${FA}"
    mmseqs createdb "${FA}" "${ODIR}"/"${NAM}"_DB
    mmseqs extractorfs "${ODIR}"/"${NAM}"_DB "${ODIR}"/"${NAM}"_DB_orf --longest-orf --min-length 30 --max-length 48000
    mmseqs translatenucs "${ODIR}"/"${NAM}"_DB_orf "${ODIR}"/"${NAM}"_DB_aa
    mmseqs search "${ODIR}"/"${NAM}"_DB_aa /data/oerc-microb3/afernandezguerra/biodb/bactArNOGDB "${ODIR}"/"${NAM}"_eggnog "${TMP}" --max-seqs 50 --threads "${NSLOTS}" -a -e 1e-5
}
main "$@"