Skip to content

Instantly share code, notes, and snippets.

@caryan
Created May 30, 2016 00:29
Show Gist options
  • Save caryan/1a5624d8539c83d01cef2663b3bc2e7d to your computer and use it in GitHub Desktop.
Julia script wrapping a Picard and GATK variant calling pipeline
using Glob
using DataFrames
const PICARD = "/home/cryan/Downloads/picard-tools-2.1.0/picard.jar"
const GATK = "/home/cryan/Downloads/GATK/GenomeAnalysisTK.jar"
function prepare_reference(ref)
    # Generate the auxiliary files the downstream tools expect to sit next to
    # the reference: bwa index files, a samtools .fai, and a Picard .dict.
    fasta = "$ref.fasta"
    # index the reference for bwa
    run(`bwa index $fasta`)
    # samtools index TODO: does it need to be sorted? .fai file seems minimal
    run(`samtools faidx $fasta`)
    # sequence dictionary is required by GATK IndelRealigner later on
    run(`java -jar $PICARD CreateSequenceDictionary REFERENCE=$fasta OUTPUT=$ref.dict`)
end
function call_variants(ref, reads)
    # Run the whole per-sample pipeline: bwa alignment, Picard read-group
    # tagging / dedup / indexing, GATK indel realignment and haplotype calling.
    # Assumes the paired reads live in $(reads)_R1_001.fastq.gz and
    # $(reads)_R2_001.fastq.gz.
    fasta = "$ref.fasta"
    sam = "$reads.aln.sam"
    sorted_bam = "$reads.sorted.bam"
    dedup_bam = "$reads.dedup.bam"
    intervals = "$reads.targetintervals.list"
    realigned_bam = "$reads.realigned.bam"

    # align reads with bwa, redirecting the SAM output to a file
    run(pipeline(`bwa mem $fasta $(reads)_R1_001.fastq.gz $(reads)_R2_001.fastq.gz`, sam))

    # Sort, add fake ReadGroup IDs (GATK RealignerTargetCreator below refuses
    # reads without them) and convert to bam; the strain id becomes the sample.
    strain = match(r"AZPAE\d{5}", reads).match
    run(`java -jar $PICARD AddOrReplaceReadGroups I=$sam O=$sorted_bam
    RGLB=kos1 RGPL=illumina RGPU=unit1 RGSM=$strain SORT_ORDER=coordinate`)

    # mark duplicate reads
    run(`java -jar $PICARD MarkDuplicates I=$sorted_bam O=$dedup_bam METRICS_FILE=metrics.txt`)
    # build bam index (requires a coordinate-sorted bam)
    run(`java -jar $PICARD BuildBamIndex I=$dedup_bam`)

    # realignment targets for IndelRealigner below
    run(`java -jar $GATK -T RealignerTargetCreator -R $fasta -I $dedup_bam -o $intervals`)
    # indel realignment; needs the reference dictionary from prepare_reference
    run(`java -jar $GATK -T IndelRealigner -R $fasta -I $dedup_bam
    -targetIntervals $intervals -o $realigned_bam`)

    # call variants (takes ~10 minutes on a pair of 150M fastq.gz files)
    run(`java -jar $GATK -T HaplotypeCaller -R $fasta -I $realigned_bam
    -ploidy 1 -o $reads.vcf`)
end
function call_variants_directory(dir, ref)
    # Run call_variants for every strain whose paired fastq.gz files sit in dir.
    prepare_reference(joinpath(dir, ref))
    # strip the _R1/_R2 suffix from each fastq filename and dedupe → strain ids
    fastq_files = glob("*.fastq.gz", dir)
    strains = unique([match(r"(AZPAE\d{5}.*)_R[12]_001.fastq.gz", f).captures[1] for f in fastq_files])
    @parallel for strain in strains
        call_variants(joinpath(dir, ref), joinpath(dir, strain))
    end
end
function gatk_variants_table_to_clc(df_gatk::DataFrame)
#Helper function to convert a variant table from GATK to CLC style.
#Reads columns POS, REF, ALT from the GATK VariantsToTable output and emits
#CLC-style columns: Ref_Pos_, Type ("SNV"/"MNV"/"Insertion"/"Deletion"),
#Length, Reference, Allele. Assumes df_gatk rows are sorted by POS — the MNV
#merge below relies on adjacent rows being positionally contiguous.
df = DataFrame(Ref_Pos_=Int[], Type=AbstractString[], Length=Int[], Reference=AbstractString[], Allele=AbstractString[])
for idx = 1:size(df_gatk,1)
#Check for contiguous SNV to turn into MNV: the current row must be a
#single-base substitution starting exactly where the previous output row ends
#(POS == previous Ref_Pos_ + Length)
if (idx > 1) && ((df[end, :Type] == "SNV") || (df[end, :Type] == "MNV")) &&
(df_gatk[idx, :POS] === df[end,:Ref_Pos_]+df[end,:Length]) &&
(length(df_gatk[idx, :REF]) === 1) && (length(df_gatk[idx, :ALT]) === 1)
#extend the previous output row in place instead of pushing a new one
df[end, :Type] = "MNV"
df[end, :Length] += 1
df[end, :Reference] *= df_gatk[idx, :REF]
df[end, :Allele] *= df_gatk[idx, :ALT]
#Look for insertion: VCF anchors indels on the base before the event, so
#report from POS+1 and drop the shared anchor base from ALT
elseif (length(df_gatk[idx, :REF]) === 1) && (length(df_gatk[idx, :ALT]) > 1)
push!(df, [df_gatk[idx, :POS]+1, "Insertion", length(df_gatk[idx, :ALT]) - 1, "-", df_gatk[idx, :ALT][2:end]] )
#Look for deletion (same anchor-base convention: drop the first REF base)
elseif (length(df_gatk[idx, :REF]) > 1) && (length(df_gatk[idx, :ALT]) === 1)
push!(df, [df_gatk[idx, :POS]+1, "Deletion", length(df_gatk[idx, :REF]) - 1, df_gatk[idx, :REF][2:end], "-"] )
#Otherwise push on the SNP
#NOTE(review): a row where both REF and ALT are longer than one base (a
#complex substitution) also falls through to here and is labelled "SNV" with
#Length 1 — confirm GATK output never contains such records for this pipeline
else
push!(df, [df_gatk[idx, :POS], "SNV", 1, df_gatk[idx, :REF], df_gatk[idx, :ALT]])
end
end
return df
end
function consolidate_variant_calls(dir)
    # Consolidate per-strain variant calls in `dir` into a single CLC-style
    # table for comparative analysis; writes variant_summary.tsv into `dir`
    # and returns the combined DataFrame.
    #
    # Export each vcf to a tab-separated table with GATK VariantsToTable.
    # Use the glob(pattern, dir) form (as at call_variants_directory):
    # glob(joinpath(dir, "*.vcf")) breaks for absolute `dir` because Glob.jl
    # rejects patterns that start with '/'.
    vcf_files = glob("*.vcf", dir)
    ref_file = joinpath(dir, "PAO1_reference.fasta")
    for f in vcf_files
        base_name = splitext(f)[1]
        run(`java -jar $GATK -T VariantsToTable -R $ref_file -V $f
        -F POS -F REF -F ALT -F QUAL -o $base_name.variants.tsv`)
    end
    # Load each table, convert to CLC style, and relabel the Allele column
    # with the strain name so the per-strain columns stay distinct after joins.
    dfs = []
    tsv_files = glob("*.variants.tsv", dir)
    for f in tsv_files
        df = readtable(f)
        strain_name = match(r"(AZPAE\d{5})", f).captures[1]
        df = gatk_variants_table_to_clc(df)
        rename!(df, :Allele, Symbol(strain_name))
        push!(dfs, df)
    end
    # Outer-join all strains into one frame.
    main_df = dfs[1]
    for ct = 2:length(dfs)
        # Joining on :Reference too would be cleaner but hits a DataFrames
        # outer-join FIXME
        # (https://github.com/JuliaStats/DataFrames.jl/blob/039432c62a7d01285e93f91a2363eb5caa86036c/src/abstractdataframe/join.jl#L97),
        # so join on position/type/length and reconcile the duplicated
        # Reference columns by hand below.
        main_df = join(main_df, dfs[ct], on=[:Ref_Pos_, :Type, :Length], kind=:outer)
        # `row` (was `ct`) — the old name shadowed the outer loop variable
        for row = 1:size(main_df,1)
            # rows present in both frames must agree on the reference allele
            if !isna(main_df[row, :Reference]) && !isna(main_df[row, :Reference_1])
                if main_df[row, :Reference] != main_df[row, :Reference_1]
                    error("Rows don't match at $row")
                end
            end
            # fill Reference from the right-hand frame where the left was NA
            if isna(main_df[row, :Reference])
                main_df[row, :Reference] = main_df[row, :Reference_1]
            end
        end
        delete!(main_df, :Reference_1)
    end
    sort!(main_df, cols=:Ref_Pos_)
    writetable(joinpath(dir, "variant_summary.tsv"), main_df)
    return main_df
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment