Skip to content

Instantly share code, notes, and snippets.

@lindenb
Created May 29, 2018 07:18
Show Gist options
  • Save lindenb/f4443b806b8118a478ff2ba387b02dd7 to your computer and use it in GitHub Desktop.
Save lindenb/f4443b806b8118a478ff2ba387b02dd7 to your computer and use it in GitHub Desktop.
test nextflow anonymized , multi-calling, 1 vcf per bed line, 3 methods hapcaller|samtools|freebayes
REF="/path/to/human_g1k_v37.fasta"
java_exe="java"
gatk_jar="/path/to/GenomeAnalysisTK.jar"
params.test_flag="F"
params.prefix="20180515.NEXTFLOW"
params.callers=["samtools","freebayes","hapcaller"]
params.description="${params.prefix} : Pierre test Nextflow "
captures = Channel.from(
"Capture01", "capture01.bed",
"Capture02", "capture02.bed",
"Capture03", "capture03.bed",
"Capture04", "capture04.bed",
).
buffer( size: 2 ).
map{T->[T[0],file(T[1])]}.
dump()
String jvarkitjar(String name) { " /path/to/jvarkit-git/${name}.jar " }
process makeBamList {
tag "bam list"
output:
file("bam.list") into bam_list
script:
"""
find dir1 dir2 dir3 dir3 dir4 -type f \
-name "*final.bam" > bam.list
if [ "${params.test_flag}" == "T" ]
then
cat bam.list | shuf | head > tmp && mv tmp bam.list
fi
grep -m1 -F "_final.bam" bam.list > /dev/null
"""
}
process normalizeBed {
tag "normalizeBed ${bedName}:${bedFile}"
echo true
input:
set val(bedName),file(bedFile) from captures
output:
set bedName,file("${bedName}.bed.gz") into normBed1,normBed2
file("${bedName}.bed.gz.tbi")
script:
"""
cat "${bedFile}" |\\
tr -d '\\r' |\\
grep -v -E '^(browser|track)' |\\
cut -f 1,2,3 | sed 's/^chr//' |\\
LC_ALL=C sort -t \$'\\t' -k1,1 -k2,2n |\\
uniq |
bgzip > "${bedName}.bed.gz"
tabix -f -p bed "${bedName}.bed.gz"
"""
}
process mergeBeds {
tag "Merge ${beds.size()} capture(s)"
input:
val beds from normBed1.map{T->T[1]}.flatten().collect()
output:
file("capture.bed") into cleanbed1
script:
"""
gunzip -c ${beds.join(" ")} |
LC_ALL=C sort -t \$'\\t' -k1,1 -k2,2n |\\
uniq |\\
bedtools slop -b 1000 -g "${REF}.fai" |\\
bedtools merge -d 1000 -i - > capture.bed
if [ "${params.test_flag}" == "T" ]
then
cat capture.bed | shuf | head -n 30 > tmp && mv tmp capture.bed
fi
"""
}
process distinctContigs {
tag "distinct contigs"
input:
file capture from cleanbed1
output:
file ("contigs.list") into distinct_contigs_file
script:
"""
cut -f 1 "${capture}" | uniq | LC_ALL=C sort -V |uniq > contigs.list
test -s contigs.list
"""
}
process callBamSamtools {
tag "call samtools (${chrom}:${start}-${end}) and ${bam_list}"
when:
params.callers.contains("samtools")
input:
set chrom,start,end,bam_list from cleanbed1.
splitCsv(header: false,sep:'\t',strip:true).
combine(bam_list)
output:
set chrom,start,end,file("samtools.${chrom}_${start}_${end}.vcf.gz"),val("samtools") into called_vcfs_samtools
script:
"""
if [ "${params.test_flag}" == "T" ]
then
touch "samtools.${chrom}_${start}_${end}.vcf.gz"
else
bcftools mpileup -Ou -f "${REF}" \\
--bam-list "${bam_list}" \\
--region "${chrom}:${start}-${end}" \\
--annotate 'FORMAT/AD,FORMAT/ADF,FORMAT/ADR,FORMAT/DP,FORMAT/SP,INFO/AD,INFO/ADF,INFO/ADR' \\
--redo-BAQ --adjust-MQ 50 --min-MQ 30 \\
--count-orphans \\
--max-depth 10000 \\
--max-idepth 10000 |\\
bcftools call \\
--ploidy GRCh37 \\
--multiallelic-caller \\
--variants-only -O z -o "samtools.${chrom}_${start}_${end}.vcf.gz"
fi
"""
}
process callBamFreebayes {
tag "call freebayes (${chrom}:${start}-${end}) and ${bam_list}"
when:
params.callers.contains("freebayes")
input:
set chrom,start,end,bam_list from cleanbed1.
splitCsv(header: false,sep:'\t',strip:true).
combine(bam_list)
output:
set chrom,start,end,file("freebayes.${chrom}_${start}_${end}.vcf.gz"),val("freebayes") into called_vcfs_freebayes
script:
"""
if [ "${params.test_flag}" == "T" ]
then
touch "freebayes.${chrom}_${start}_${end}.vcf.gz"
else
## ça bug parce que freebayes veut des read-groups disctint/samples
echo "${chrom} ${start} ${end}" > tmp.bed
${java_exe} -jar /path/to/picard.jar BedToIntervalList I=tmp.bed O=tmp.interval SD=`echo "${REF}" | sed 's/.fasta\$/.dict/'`
rm tmp.bed
sed 's/^/I=/' ${bam_list} > tmp.args
echo 'AS=true' >> tmp.args
echo 'RGN=tmp.interval' >> tmp.args
echo 'CREATE_INDEX=true' >> tmp.args
echo 'O=tmp.bam' >> tmp.args
${java_exe} -jar /path/to/picard.jar MergeSamFiles OPTIONS_FILE=tmp.args
rm tmp.interval
rm tmp.args
/path/to/freebayes -f "${REF}" \\
--use-best-n-alleles 3 --standard-filters --min-coverage 5 \\
--bam tmp.bam \\
--region "${chrom}:${start}-${end}" \\
--strict-vcf \\
--vcf "freebayes.${chrom}_${start}_${end}.vcf"
bgzip -f "freebayes.${chrom}_${start}_${end}.vcf"
rm -f tmp.*
fi
"""
}
process callBamHapCaller {
tag "call hapcaller (${chrom}:${start}-${end}) and ${bam_list}"
memory '9 GB'
errorStrategy 'retry'
when:
params.callers.contains("hapcaller")
input:
set chrom,start,end,bam_list from cleanbed1.
splitCsv(header: false,sep:'\t',strip:true).
combine(bam_list)
output:
set chrom,start,end,file("hapcaller.${chrom}_${start}_${end}.vcf.gz"),val("hapcaller") into called_vcfs_hapcaller
script:
"""
if [ "${params.test_flag}" == "T" ]
then
touch "hapcaller.${chrom}_${start}_${end}.vcf.gz"
else
echo "${chrom} ${start} ${end}" > tmp.bed
${java_exe} -jar /path/to/picard.jar BedToIntervalList I=tmp.bed O=tmp.interval SD=`echo "${REF}" | sed 's/.fasta\$/.dict/'`
rm tmp.bed
sed 's/^/I=/' ${bam_list} > tmp.args
echo 'AS=true' >> tmp.args
echo 'RGN=tmp.interval' >> tmp.args
echo 'CREATE_INDEX=true' >> tmp.args
echo 'O=tmp.bam' >> tmp.args
${java_exe} -jar /path/to/picard.jar MergeSamFiles OPTIONS_FILE=tmp.args
rm tmp.interval
rm tmp.args
${java_exe} -Xmx9g -Djava.io.tmpdir=. -jar ${gatk_jar} \\
-T HaplotypeCaller -R "${REF}" \\
-I tmp.bam \\
--log_to_file tmp.hapcaller.log \\
--max_alternate_alleles 3 \\
-L "${chrom}:${start}-${end}" \\
--dbsnp/path/to/dbsnp_138.b37.vcf \\
-o "hapcaller.${chrom}_${start}_${end}.vcf.gz"
rm -f tmp.* *.tbi
fi
"""
}
process vcfListPerContig {
tag "list for ${vcfVector.size()} vcf(s) called w. ${caller} chr. ${contig}"
input:
set contig,caller,vcfVector from distinct_contigs_file.
splitText().
map{C->C.trim()}.
combine(called_vcfs_samtools.concat(called_vcfs_freebayes).concat(called_vcfs_hapcaller)).
filter{L->L[0].equals(L[1])}.
map{L->[ [L[0],L[5]], [L[1],L[2],L[3],L[4]] ]}.
groupTuple().
map{L->[L[0][0],L[0][1],L[1]]}
output:
set contig,caller,file("vcf.${caller}.${contig}.list") into vcf_list
script:
lines = vcfVector.stream().map{
T->T.stream().map{S->S.toString()}.collect(java.util.stream.Collectors.joining(","))
}.collect(java.util.stream.Collectors.joining("\n"))
"""
cat << __EOF__ | LC_ALL=C sort -t, -k2,2n | cut -d, -f 4 > "vcf.${caller}.${contig}.list"
${lines}
__EOF__
grep -m1 -F "${caller}" "vcf.${caller}.${contig}.list" > /dev/null
grep -m1 -F ".vcf.gz" "vcf.${caller}.${contig}.list" > /dev/null
"""
}
process mergeVcfPerContigSamtoolsOrFreebayes {
tag "making list from ${vcflist} for chr. ${contig} w. ${caller}"
input:
set contig,caller,vcflist from vcf_list
output:
set contig,caller,file("merged.${caller}.${contig}.vcf.gz") into merged_contig_vcf_samtools_freebayes
script:
"""
function dogather {
${java_exe} \\
-jar /path/to/picard.jar GatherVcfs \\
I=\$1 O=\$2
}
if [ "${params.test_flag}" == "T" ]
then
touch merged.${caller}.${contig}.vcf.gz
else
split --lines=1000 -a 6 --numeric-suffixes "${vcflist}" __SL
rm -f _final.list
find . -type f -name "__SL*" | sort -V | while read F
do
mv "\${F}" "\${F}.list"
dogather "\${F}.list" "\${F}.vcf"
echo "\${F}.vcf" >> _final.list
done
dogather _final.list merged.${caller}.${contig}.vcf.gz
rm -f _final.list __SL*
fi
"""
}
process fetch_cardiogenes {
tag "fetch cardiogenes"
output:
file("cardiogene.txt") into cardiogene_out
script:
"""
if [ "${params.test_flag}" == "T" ]
then
echo SCN5A > cardiogene.txt
else
wget -O - "ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/bhf-ucl/gene_association.goa_bhf-ucl.gz" | \\
gunzip -c | grep -F 'taxon:9606' | cut -f 3 | uniq | LC_ALL=C sort | uniq > cardiogene.txt
grep -m1 SCN5A cardiogene.txt
fi
"""
}
process annotateVcfPerContig {
tag "annotation ${vcfin} for ${contig}"
echo true
input:
set contig,caller,vcfin from merged_contig_vcf_samtools_freebayes
file cardiogenes from cardiogene_out
val vec from normBed2.collect()
output:
set contig,caller,file("annotation.${caller}.${contig}.vcf.gz"),file("annotation.${caller}.${contig}.vcf.gz.tbi") into contig_annot_vcf
script:
"""
if [ "${params.test_flag}" == "T" ]
then
touch annotation.${caller}.${contig}.vcf.gz annotation.${caller}.${contig}.vcf.gz.tbi
else
set -x
gunzip -c "${vcfin}" |\\
awk -F '\\t' '/^#CHROM/ {printf("##DESCRIPTION=Calling with ${caller}. ${params.description}\\n");} {print}' |\\
${java_exe} \\
-jar ${jvarkitjar("vcfgnomad")} \\
-ac -m /path/to/gnomad/vcf.manifest |\\
${java_exe} \\
-jar ${jvarkitjar("vcfbigwig")} \\
--aggregate avg --transform ensembl2ucsc -tag duke35 --bigwig "/path/to/wgEncodeDukeMapabilityUniqueness35bp.bigWig" |\\
${java_exe} \\
-jar ${jvarkitjar("vcfpolyx")} \\
-R ${REF} --filter 10 |\\
${java_exe} \\
-jar ${jvarkitjar("vcffilterjdk")} \\
--filter GNOMAD_AF_NFE_1_1000 -e 'return !variant.hasAttribute("gnomad_genome_AF_NFE") || variant.getAttributeAsList("gnomad_genome_AF_NFE").stream().map(O->O==null || O.toString().equals(".")?0:Double.parseDouble(O.toString())).anyMatch(V->V< 1/1000.0);' |\\
${java_exe} \\
-jar ${jvarkitjar("vcffilterjdk")} \\
--filter ALL_HOM_VAR -e 'return variant.getNSamples() <= 1 || !variant.getGenotypes().stream().filter(G->G.isCalled()).allMatch(G->G.isHomVar());' |\\
${java_exe} \\
-jar ${jvarkitjar("vcffilterjdk")} \\
--filter LOW_DEPTH_10 -e 'return variant.getGenotypes().stream().filter(G->G.isCalled() && G.hasDP()).mapToInt(G->G.getDP()).average().orElse(0)>=10;' |\\
${java_exe} -Djava.io.tmpdir=. \\
-jar /patg/to/snpEff.jar ann \\
-c /patg/to/snpEff.config GRCh37.75 -nodownload -noStats |\\
${java_exe} -jar ${jvarkitjar("vcffilterso")} \\
--filterout NOT_PROTEIN_ALTERING --acn "SO:0001818" |\\
${java_exe} -jar ${jvarkitjar("vcfburdenfiltergenes")} \\
--filter NOT_CARDIOGEN -g "${cardiogenes}" > jeter.vcf
${java_exe} -Djava.io.tmpdir=. -jar ${gatk_jar} \\
-T VariantAnnotator -R ${REF} --dbsnp/path/to/dbsnp_138.b37.vcf --variant jeter.vcf -L jeter.vcf -o jeter2.vcf
rm jeter.vcf
${java_exe} -Djava.io.tmpdir=. -jar ${gatk_jar} \\
-T VariantFiltration -R ${REF} -o jeter3.vcf --variant jeter2.vcf -L jeter2.vcf \\
--maskName MapabilityConsensusExcludable --mask:BED /path/to/wgEncodeDacMapabilityConsensusExcludable_nochrprefix.bed.gz
rm jeter2.vcf
${java_exe} -Djava.io.tmpdir=. -jar ${gatk_jar} \\
-T VariantFiltration -R ${REF} -o jeter4.vcf --variant jeter3.vcf -L jeter3.vcf \\
--filterExpression 'duke35<1.0' --filterName "DUKE35_LT1"
rm jeter3.vcf
echo "${vec.join(" ")}" | tr " " "\\n" | paste - - | while read FN FF
do
${java_exe} -Djava.io.tmpdir=. -jar ${gatk_jar} \\
-T VariantFiltration -R ${REF} -o jeter5.vcf --variant jeter4.vcf -L jeter4.vcf \\
--filterNotInMask \\
--maskName "NOT_IN_\${FN}" --mask:BED \${FF}
mv jeter5.vcf jeter4.vcf
mv jeter5.vcf.idx jeter4.vcf.idx
done
${java_exe} -Djava.io.tmpdir=. -jar ${gatk_jar} \\
-T VariantFiltration -R ${REF} -o jeter5.vcf --variant jeter4.vcf -L jeter4.vcf \\
--filterExpression 'duke35<1.0' --filterName "DUKE35_LT1"
cp jeter5.vcf annotation.${caller}.${contig}.vcf
rm jeter*
bgzip -f annotation.${caller}.${contig}.vcf
tabix -f -p vcf annotation.${caller}.${contig}.vcf.gz
fi
"""
}
process makeVcfListAnnotContigs {
tag "making list from ${vcfVector.size()} w. ${caller}"
input:
set caller,vcfVector from contig_annot_vcf.
map{L->[L[1],L[0]+","+L[2]]}.
groupTuple()
output:
set caller,file("vcf.${caller}.list") into vcf_all_list
script:
"""
cat << __EOF__ | LC_ALL=C sort -t, -k1,1 > tmp2.txt
${vcfVector.join("\n")}
__EOF__
awk -F '\\t' '{printf("%s,%d\\n",\$1,NR);}' "${REF}.fai" | sort -t, -k1,1 > tmp1.txt
join -t , -1 1 -2 1 tmp1.txt tmp2.txt | sort -t, -k2,2n | cut -d, -f 3 > vcf.${caller}.list
rm -f tmp1.txt tmp2.txt
grep -m1 '.vcf.gz\$' vcf.${caller}.list > /dev/null
"""
}
process mergeVcf {
tag "making list from ${vcflist} "
input:
set caller,vcflist from vcf_all_list
output:
set caller,file("${params.prefix}.${caller}.vcf.gz"),file("${params.prefix}.${caller}.vcf.gz.tbi") into merged_vcf
script:
"""
if [ "${params.test_flag}" == "T" ]
then
touch "${params.prefix}.${caller}.vcf.gz" "${params.prefix}.${caller}.vcf.gz.tbi"
else
${java_exe} \\
-jar /path/to/picard.jar GatherVcfs \\
I=${vcflist} O=${params.prefix}.${caller}.vcf.gz
tabix -f -p vcf ${params.prefix}.${caller}.vcf.gz
fi
"""
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment