// test nextflow (anonymized): multi-calling, one VCF per BED line, three methods: hapcaller|samtools|freebayes
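//
// Pipeline outline:
//  1. list the BAM files to call
//  2. normalize each capture BED (strip 'chr' prefix, sort, bgzip+tabix) and merge them
//  3. call each BED line with bcftools/samtools, freebayes and GATK HaplotypeCaller
//  4. gather the per-region VCFs per contig and per caller, then annotate them
//  5. concatenate the per-contig VCFs into one final VCF per caller
//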
REF="/path/to/human_g1k_v37.fasta"
java_exe="java"
gatk_jar="/path/to/GenomeAnalysisTK.jar"

params.test_flag="F"
params.prefix="20180515.NEXTFLOW"
params.callers=["samtools","freebayes","hapcaller"]
params.description="${params.prefix} : Pierre test Nextflow"
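// Capture kits are declared as flat (name, BED path) pairs; buffer(size:2)
// regroups them into [name, bed] tuples and map() converts the path to a file object.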
captures = Channel.from(
        "Capture01", "capture01.bed",
        "Capture02", "capture02.bed",
        "Capture03", "capture03.bed",
        "Capture04", "capture04.bed"
        ).
        buffer( size: 2 ).
        map{T->[T[0],file(T[1])]}.
        dump()
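// Helper returning the path of a jvarkit jar, padded with spaces so the
// result can be interpolated directly between command-line tokens.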
String jvarkitjar(String name) { " /path/to/jvarkit-git/${name}.jar " }

process makeBamList {
    tag "bam list"
    output:
        file("bam.list") into bam_list1,bam_list2,bam_list3
    script:
    """
    find dir1 dir2 dir3 dir4 -type f \\
        -name "*final.bam" > bam.list

    if [ "${params.test_flag}" == "T" ]
    then
        cat bam.list | shuf | head > tmp && mv tmp bam.list
    fi

    grep -m1 -F "_final.bam" bam.list > /dev/null
    """
}
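// Normalize one capture BED: remove CR and browser/track lines, keep the first
// three columns, strip the 'chr' prefix, sort, deduplicate, then bgzip+tabix.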
process normalizeBed {
    tag "normalizeBed ${bedName}:${bedFile}"
    echo true
    input:
        set val(bedName),file(bedFile) from captures
    output:
        set bedName,file("${bedName}.bed.gz") into normBed1,normBed2
        file("${bedName}.bed.gz.tbi")
    script:
    """
    cat "${bedFile}" |\\
        tr -d '\\r' |\\
        grep -v -E '^(browser|track)' |\\
        cut -f 1,2,3 | sed 's/^chr//' |\\
        LC_ALL=C sort -t \$'\\t' -k1,1 -k2,2n |\\
        uniq |\\
        bgzip > "${bedName}.bed.gz"

    tabix -f -p bed "${bedName}.bed.gz"
    """
}
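// Merge all normalized capture BEDs into a single 'capture.bed',
// padding each interval by 1kb and merging intervals closer than 1kb.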
process mergeBeds {
    tag "Merge ${beds.size()} capture(s)"
    input:
        val beds from normBed1.map{T->T[1]}.flatten().collect()
    output:
        file("capture.bed") into cleanbed1,cleanbed2,cleanbed3,cleanbed4
    script:
    """
    gunzip -c ${beds.join(" ")} |\\
        LC_ALL=C sort -t \$'\\t' -k1,1 -k2,2n |\\
        uniq |\\
        bedtools slop -b 1000 -g "${REF}.fai" -i - |\\
        bedtools merge -d 1000 -i - > capture.bed

    if [ "${params.test_flag}" == "T" ]
    then
        cat capture.bed | shuf | head -n 30 > tmp && mv tmp capture.bed
    fi
    """
}
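// Extract the distinct contig names of the merged capture; they are used
// later to group the per-region VCFs per chromosome.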
process distinctContigs {
    tag "distinct contigs"
    input:
        file capture from cleanbed1
    output:
        file("contigs.list") into distinct_contigs_file
    script:
    """
    cut -f 1 "${capture}" | uniq | LC_ALL=C sort -V | uniq > contigs.list
    test -s contigs.list
    """
}
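// Call variants with bcftools (samtools): the merged capture is split into one
// task per BED line (splitCsv) and combined with the BAM list; test mode just
// touches the expected output.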
process callBamSamtools {
    tag "call samtools (${chrom}:${start}-${end}) and ${bam_list}"
    when:
        params.callers.contains("samtools")
    input:
        set chrom,start,end,bam_list from cleanbed2.
            splitCsv(header: false,sep:'\t',strip:true).
            combine(bam_list1)
    output:
        set chrom,start,end,file("samtools.${chrom}_${start}_${end}.vcf.gz"),val("samtools") into called_vcfs_samtools
    script:
    """
    if [ "${params.test_flag}" == "T" ]
    then
        touch "samtools.${chrom}_${start}_${end}.vcf.gz"
    else
        bcftools mpileup -Ou -f "${REF}" \\
            --bam-list "${bam_list}" \\
            --region "${chrom}:${start}-${end}" \\
            --annotate 'FORMAT/AD,FORMAT/ADF,FORMAT/ADR,FORMAT/DP,FORMAT/SP,INFO/AD,INFO/ADF,INFO/ADR' \\
            --redo-BAQ --adjust-MQ 50 --min-MQ 30 \\
            --count-orphans \\
            --max-depth 10000 \\
            --max-idepth 10000 |\\
        bcftools call \\
            --ploidy GRCh37 \\
            --multiallelic-caller \\
            --variants-only -O z -o "samtools.${chrom}_${start}_${end}.vcf.gz"
    fi
    """
}
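// Call variants with freebayes. The BAMs are first merged with picard
// MergeSamFiles restricted to the current interval, because freebayes expects
// distinct read-groups/samples in its inputs.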
process callBamFreebayes {
    tag "call freebayes (${chrom}:${start}-${end}) and ${bam_list}"
    when:
        params.callers.contains("freebayes")
    input:
        set chrom,start,end,bam_list from cleanbed3.
            splitCsv(header: false,sep:'\t',strip:true).
            combine(bam_list2)
    output:
        set chrom,start,end,file("freebayes.${chrom}_${start}_${end}.vcf.gz"),val("freebayes") into called_vcfs_freebayes
    script:
    """
    if [ "${params.test_flag}" == "T" ]
    then
        touch "freebayes.${chrom}_${start}_${end}.vcf.gz"
    else
        ## this breaks without merging, because freebayes wants distinct read-groups/samples
        printf '%s\\t%s\\t%s\\n' "${chrom}" "${start}" "${end}" > tmp.bed
        ${java_exe} -jar /path/to/picard.jar BedToIntervalList I=tmp.bed O=tmp.interval SD=`echo "${REF}" | sed 's/.fasta\$/.dict/'`
        rm tmp.bed

        sed 's/^/I=/' ${bam_list} > tmp.args
        echo 'AS=true' >> tmp.args
        echo 'RGN=tmp.interval' >> tmp.args
        echo 'CREATE_INDEX=true' >> tmp.args
        echo 'O=tmp.bam' >> tmp.args
        ${java_exe} -jar /path/to/picard.jar MergeSamFiles OPTIONS_FILE=tmp.args
        rm tmp.interval
        rm tmp.args

        /path/to/freebayes -f "${REF}" \\
            --use-best-n-alleles 3 --standard-filters --min-coverage 5 \\
            --bam tmp.bam \\
            --region "${chrom}:${start}-${end}" \\
            --strict-vcf \\
            --vcf "freebayes.${chrom}_${start}_${end}.vcf"

        bgzip -f "freebayes.${chrom}_${start}_${end}.vcf"
        rm -f tmp.*
    fi
    """
}
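// Call variants with GATK HaplotypeCaller, again on a per-interval merged BAM;
// the task retries on failure with 9 GB of memory.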
process callBamHapCaller {
    tag "call hapcaller (${chrom}:${start}-${end}) and ${bam_list}"
    memory '9 GB'
    errorStrategy 'retry'
    when:
        params.callers.contains("hapcaller")
    input:
        set chrom,start,end,bam_list from cleanbed4.
            splitCsv(header: false,sep:'\t',strip:true).
            combine(bam_list3)
    output:
        set chrom,start,end,file("hapcaller.${chrom}_${start}_${end}.vcf.gz"),val("hapcaller") into called_vcfs_hapcaller
    script:
    """
    if [ "${params.test_flag}" == "T" ]
    then
        touch "hapcaller.${chrom}_${start}_${end}.vcf.gz"
    else
        printf '%s\\t%s\\t%s\\n' "${chrom}" "${start}" "${end}" > tmp.bed
        ${java_exe} -jar /path/to/picard.jar BedToIntervalList I=tmp.bed O=tmp.interval SD=`echo "${REF}" | sed 's/.fasta\$/.dict/'`
        rm tmp.bed

        sed 's/^/I=/' ${bam_list} > tmp.args
        echo 'AS=true' >> tmp.args
        echo 'RGN=tmp.interval' >> tmp.args
        echo 'CREATE_INDEX=true' >> tmp.args
        echo 'O=tmp.bam' >> tmp.args
        ${java_exe} -jar /path/to/picard.jar MergeSamFiles OPTIONS_FILE=tmp.args
        rm tmp.interval
        rm tmp.args

        ${java_exe} -Xmx9g -Djava.io.tmpdir=. -jar ${gatk_jar} \\
            -T HaplotypeCaller -R "${REF}" \\
            -I tmp.bam \\
            --log_to_file tmp.hapcaller.log \\
            --max_alternate_alleles 3 \\
            -L "${chrom}:${start}-${end}" \\
            --dbsnp /path/to/dbsnp_138.b37.vcf \\
            -o "hapcaller.${chrom}_${start}_${end}.vcf.gz"
        rm -f tmp.* *.tbi
    fi
    """
}
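// For each (contig, caller) pair, collect all the VCFs called on that contig
// by that caller and write them, sorted by start position, into a list file.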
process vcfListPerContig {
    tag "list for ${vcfVector.size()} vcf(s) called w. ${caller} chr. ${contig}"
    input:
        set contig,caller,vcfVector from distinct_contigs_file.
            splitText().
            map{C->C.trim()}.
            combine(called_vcfs_samtools.concat(called_vcfs_freebayes).concat(called_vcfs_hapcaller)).
            filter{L->L[0].equals(L[1])}.
            map{L->[ [L[0],L[5]], [L[1],L[2],L[3],L[4]] ]}.
            groupTuple().
            map{L->[L[0][0],L[0][1],L[1]]}
    output:
        set contig,caller,file("vcf.${caller}.${contig}.list") into vcf_list
    script:
        lines = vcfVector.stream().map{
            T->T.stream().map{S->S.toString()}.collect(java.util.stream.Collectors.joining(","))
            }.collect(java.util.stream.Collectors.joining("\n"))
    """
    cat << __EOF__ | LC_ALL=C sort -t, -k2,2n | cut -d, -f 4 > "vcf.${caller}.${contig}.list"
${lines}
__EOF__
    grep -m1 -F "${caller}" "vcf.${caller}.${contig}.list" > /dev/null
    grep -m1 -F ".vcf.gz" "vcf.${caller}.${contig}.list" > /dev/null
    """
}
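// Concatenate the per-region VCFs of one contig with picard GatherVcfs,
// in batches of 1000 files, then gather the intermediate batch results.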
process mergeVcfPerContigSamtoolsOrFreebayes {
    tag "merging ${vcflist} for chr. ${contig} w. ${caller}"
    input:
        set contig,caller,vcflist from vcf_list
    output:
        set contig,caller,file("merged.${caller}.${contig}.vcf.gz") into merged_contig_vcf_samtools_freebayes
    script:
    """
    function dogather {
        ${java_exe} \\
            -jar /path/to/picard.jar GatherVcfs \\
            I=\$1 O=\$2
    }

    if [ "${params.test_flag}" == "T" ]
    then
        touch merged.${caller}.${contig}.vcf.gz
    else
        split --lines=1000 -a 6 --numeric-suffixes "${vcflist}" __SL
        rm -f _final.list
        find . -type f -name "__SL*" | sort -V | while read F
        do
            mv "\${F}" "\${F}.list"
            dogather "\${F}.list" "\${F}.vcf"
            echo "\${F}.vcf" >> _final.list
        done
        dogather _final.list merged.${caller}.${contig}.vcf.gz
        rm -f _final.list __SL*
    fi
    """
}
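// Fetch the gene symbols of the GOA/BHF-UCL cardiovascular annotation set;
// in test mode only SCN5A is used.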
process fetch_cardiogenes {
    tag "fetch cardiogenes"
    output:
        file("cardiogene.txt") into cardiogene_out
    script:
    """
    if [ "${params.test_flag}" == "T" ]
    then
        echo SCN5A > cardiogene.txt
    else
        wget -O - "ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/bhf-ucl/gene_association.goa_bhf-ucl.gz" |\\
            gunzip -c | grep -F 'taxon:9606' | cut -f 3 | uniq | LC_ALL=C sort | uniq > cardiogene.txt
        grep -m1 SCN5A cardiogene.txt
    fi
    """
}
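// Annotate one per-contig VCF: add a DESCRIPTION header, gnomAD frequencies,
// Duke-35 mappability, polyX, soft filters (frequency, all-hom-var, low depth),
// snpEff consequences, protein-altering and cardio-gene filters, then GATK
// VariantAnnotator/VariantFiltration masks including the capture BEDs.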
process annotateVcfPerContig {
    tag "annotation ${vcfin} for ${contig}"
    echo true
    input:
        set contig,caller,vcfin from merged_contig_vcf_samtools_freebayes
        file cardiogenes from cardiogene_out
        val vec from normBed2.collect()
    output:
        set contig,caller,file("annotation.${caller}.${contig}.vcf.gz"),file("annotation.${caller}.${contig}.vcf.gz.tbi") into contig_annot_vcf
    script:
    """
    if [ "${params.test_flag}" == "T" ]
    then
        touch annotation.${caller}.${contig}.vcf.gz annotation.${caller}.${contig}.vcf.gz.tbi
    else
        set -x

        gunzip -c "${vcfin}" |\\
        awk -F '\\t' '/^#CHROM/ {printf("##DESCRIPTION=Calling with ${caller}. ${params.description}\\n");} {print}' |\\
        ${java_exe} \\
            -jar ${jvarkitjar("vcfgnomad")} \\
            -ac -m /path/to/gnomad/vcf.manifest |\\
        ${java_exe} \\
            -jar ${jvarkitjar("vcfbigwig")} \\
            --aggregate avg --transform ensembl2ucsc -tag duke35 --bigwig "/path/to/wgEncodeDukeMapabilityUniqueness35bp.bigWig" |\\
        ${java_exe} \\
            -jar ${jvarkitjar("vcfpolyx")} \\
            -R ${REF} --filter 10 |\\
        ${java_exe} \\
            -jar ${jvarkitjar("vcffilterjdk")} \\
            --filter GNOMAD_AF_NFE_1_1000 -e 'return !variant.hasAttribute("gnomad_genome_AF_NFE") || variant.getAttributeAsList("gnomad_genome_AF_NFE").stream().map(O->O==null || O.toString().equals(".")?0:Double.parseDouble(O.toString())).anyMatch(V->V< 1/1000.0);' |\\
        ${java_exe} \\
            -jar ${jvarkitjar("vcffilterjdk")} \\
            --filter ALL_HOM_VAR -e 'return variant.getNSamples() <= 1 || !variant.getGenotypes().stream().filter(G->G.isCalled()).allMatch(G->G.isHomVar());' |\\
        ${java_exe} \\
            -jar ${jvarkitjar("vcffilterjdk")} \\
            --filter LOW_DEPTH_10 -e 'return variant.getGenotypes().stream().filter(G->G.isCalled() && G.hasDP()).mapToInt(G->G.getDP()).average().orElse(0)>=10;' |\\
        ${java_exe} -Djava.io.tmpdir=. \\
            -jar /path/to/snpEff.jar ann \\
            -c /path/to/snpEff.config GRCh37.75 -nodownload -noStats |\\
        ${java_exe} -jar ${jvarkitjar("vcffilterso")} \\
            --filterout NOT_PROTEIN_ALTERING --acn "SO:0001818" |\\
        ${java_exe} -jar ${jvarkitjar("vcfburdenfiltergenes")} \\
            --filter NOT_CARDIOGEN -g "${cardiogenes}" > jeter.vcf

        ${java_exe} -Djava.io.tmpdir=. -jar ${gatk_jar} \\
            -T VariantAnnotator -R ${REF} --dbsnp /path/to/dbsnp_138.b37.vcf --variant jeter.vcf -L jeter.vcf -o jeter2.vcf
        rm jeter.vcf

        ${java_exe} -Djava.io.tmpdir=. -jar ${gatk_jar} \\
            -T VariantFiltration -R ${REF} -o jeter3.vcf --variant jeter2.vcf -L jeter2.vcf \\
            --maskName MapabilityConsensusExcludable --mask:BED /path/to/wgEncodeDacMapabilityConsensusExcludable_nochrprefix.bed.gz
        rm jeter2.vcf

        ${java_exe} -Djava.io.tmpdir=. -jar ${gatk_jar} \\
            -T VariantFiltration -R ${REF} -o jeter4.vcf --variant jeter3.vcf -L jeter3.vcf \\
            --filterExpression 'duke35<1.0' --filterName "DUKE35_LT1"
        rm jeter3.vcf

        echo "${vec.join(" ")}" | tr " " "\\n" | paste - - | while read FN FF
        do
            ${java_exe} -Djava.io.tmpdir=. -jar ${gatk_jar} \\
                -T VariantFiltration -R ${REF} -o jeter5.vcf --variant jeter4.vcf -L jeter4.vcf \\
                --filterNotInMask \\
                --maskName "NOT_IN_\${FN}" --mask:BED \${FF}
            mv jeter5.vcf jeter4.vcf
            mv jeter5.vcf.idx jeter4.vcf.idx
        done

        ${java_exe} -Djava.io.tmpdir=. -jar ${gatk_jar} \\
            -T VariantFiltration -R ${REF} -o jeter5.vcf --variant jeter4.vcf -L jeter4.vcf \\
            --filterExpression 'duke35<1.0' --filterName "DUKE35_LT1"

        cp jeter5.vcf annotation.${caller}.${contig}.vcf
        rm jeter*
        bgzip -f annotation.${caller}.${contig}.vcf
        tabix -f -p vcf annotation.${caller}.${contig}.vcf.gz
    fi
    """
}
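// Build, per caller, the list of annotated per-contig VCFs, ordered by the
// contig order of the reference .fai (the order GatherVcfs expects).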
process makeVcfListAnnotContigs {
    tag "making list from ${vcfVector.size()} vcf(s) w. ${caller}"
    input:
        set caller,vcfVector from contig_annot_vcf.
            map{L->[L[1],L[0]+","+L[2]]}.
            groupTuple()
    output:
        set caller,file("vcf.${caller}.list") into vcf_all_list
    script:
    """
    cat << __EOF__ | LC_ALL=C sort -t, -k1,1 > tmp2.txt
${vcfVector.join("\n")}
__EOF__
    awk -F '\\t' '{printf("%s,%d\\n",\$1,NR);}' "${REF}.fai" | sort -t, -k1,1 > tmp1.txt
    join -t , -1 1 -2 1 tmp1.txt tmp2.txt | sort -t, -k2,2n | cut -d, -f 3 > vcf.${caller}.list
    rm -f tmp1.txt tmp2.txt
    grep -m1 '.vcf.gz\$' vcf.${caller}.list > /dev/null
    """
}
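// Final step: gather the per-contig annotated VCFs into a single VCF per
// caller and index it.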
process mergeVcf {
    tag "merging ${vcflist}"
    input:
        set caller,vcflist from vcf_all_list
    output:
        set caller,file("${params.prefix}.${caller}.vcf.gz"),file("${params.prefix}.${caller}.vcf.gz.tbi") into merged_vcf
    script:
    """
    if [ "${params.test_flag}" == "T" ]
    then
        touch "${params.prefix}.${caller}.vcf.gz" "${params.prefix}.${caller}.vcf.gz.tbi"
    else
        ${java_exe} \\
            -jar /path/to/picard.jar GatherVcfs \\
            I=${vcflist} O=${params.prefix}.${caller}.vcf.gz
        tabix -f -p vcf ${params.prefix}.${caller}.vcf.gz
    fi
    """
}