Created
May 30, 2016 00:29
-
-
Save caryan/1a5624d8539c83d01cef2663b3bc2e7d to your computer and use it in GitHub Desktop.
Julia script wrapping a Picard and GATK variant calling pipeline
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using Glob | |
using DataFrames | |
#Absolute paths to the Picard and GATK jar files invoked via `java -jar`
#below — edit these to match the local installation
const PICARD = "/home/cryan/Downloads/picard-tools-2.1.0/picard.jar"
const GATK = "/home/cryan/Downloads/GATK/GenomeAnalysisTK.jar"
"""
    prepare_reference(ref)

Prepare a reference genome for the alignment/variant-calling pipeline.
`ref` is a path prefix: the sequence is read from `\$ref.fasta` and the
index/dictionary files are written alongside it.
"""
function prepare_reference(ref)
    preparation_steps = [
        #bwa's own index files, required by `bwa mem`
        `bwa index $ref.fasta`,
        #samtools .fai index TODO: does it need to be sorted? .fai file seems minimal
        `samtools faidx $ref.fasta`,
        #sequence dictionary (required for IndelRealigner)
        `java -jar $PICARD CreateSequenceDictionary REFERENCE=$ref.fasta OUTPUT=$ref.dict`,
    ]
    for cmd in preparation_steps
        run(cmd)
    end
end
"""
    call_variants(ref, reads)

Align paired-end reads against a reference and call variants with GATK.

`ref` and `reads` are path prefixes: the reference lives in `\$ref.fasta`
(already prepared by `prepare_reference`) and the reads in
`\$(reads)_R1_001.fastq.gz` / `\$(reads)_R2_001.fastq.gz`.  Intermediate
.sam/.bam files and the final `\$reads.vcf` are written alongside the inputs.
"""
function call_variants(ref, reads)
    #align reads using bwa; the SAM output on stdout is redirected to a file
    run(pipeline(`bwa mem $ref.fasta $(reads)_R1_001.fastq.gz $(reads)_R2_001.fastq.gz`, "$reads.aln.sam"))
    #Extract the strain name up front; fail with a clear message instead of
    #an opaque error on `nothing` when the path does not contain AZPAEnnnnn
    m = match(r"AZPAE\d{5}", reads)
    m === nothing && error("could not extract an AZPAE strain name from \"$reads\"")
    strain = m.match
    #Sort SAM file, add fake ReadGroup IDs (otherwise will fail at GATK
    #RealignerTargetCreator below) and convert to bam
    run(`java -jar $PICARD AddOrReplaceReadGroups I=$reads.aln.sam O=$reads.sorted.bam
        RGLB=kos1 RGPL=illumina RGPU=unit1 RGSM=$strain SORT_ORDER=coordinate`)
    # mark duplicates
    run(`java -jar $PICARD MarkDuplicates I=$reads.sorted.bam O=$reads.dedup.bam METRICS_FILE=metrics.txt`)
    # build bam index (requires bam file sorted in coordinate order)
    run(`java -jar $PICARD BuildBamIndex I=$reads.dedup.bam`)
    #Create realignment targets for IndelRealigner below
    run(`java -jar $GATK -T RealignerTargetCreator -R $ref.fasta -I $reads.dedup.bam -o $reads.targetintervals.list`)
    # Indel realignment
    #needs a reference dictionary (ref.dict for ref.fasta)
    run(`java -jar $GATK -T IndelRealigner -R $ref.fasta -I $reads.dedup.bam
        -targetIntervals $reads.targetintervals.list -o $reads.realigned.bam`)
    #Call variants (takes ~10 minutes to run on a pair of 150M fastq.gz file )
    run(`java -jar $GATK -T HaplotypeCaller -R $ref.fasta -I $reads.realigned.bam
        -ploidy 1 -o $reads.vcf`)
end
"""
    call_variants_directory(dir, ref)

Run `call_variants` for every strain's paired fastq.gz reads found in `dir`,
after preparing the reference `ref` (a filename prefix inside `dir`).
"""
function call_variants_directory(dir, ref)
    prepare_reference(joinpath(dir,ref))
    #extract the first part before the R1 or R2 and remove duplicates so
    #each read pair is processed exactly once; skip any stray fastq.gz files
    #that do not follow the AZPAEnnnnn naming convention instead of erroring
    fastq_files = glob("*.fastq.gz", dir)
    strain_matches = map(x -> match(r"(AZPAE\d{5}.*)_R[12]_001.fastq.gz", x), fastq_files)
    strains = unique(map(m -> m.captures[1], filter(m -> m !== nothing, strain_matches)))
    #@sync is required: a bare `@parallel for` dispatches the work and
    #returns immediately, so without it this function would return before
    #the workers finished calling variants
    @sync @parallel for strain in strains
        call_variants(joinpath(dir,ref), joinpath(dir,strain))
    end
end
"""
    gatk_variants_table_to_clc(df_gatk::DataFrame)

Convert a variant table produced by GATK `VariantsToTable` (columns `:POS`,
`:REF`, `:ALT`, ...) into a CLC-workbench-style table with columns
`:Ref_Pos_`, `:Type`, `:Length`, `:Reference`, `:Allele`.

Runs of adjacent single-base substitutions are merged into one "MNV" row;
REF/ALT pairs of unequal length become "Insertion"/"Deletion" rows (the `+1`
position offset and `[2:end]` slices drop the shared anchor base on which
GATK left-aligns indels); everything else becomes an "SNV" row.
NOTE(review): assumes `df_gatk` rows are sorted by position — TODO confirm
with the VariantsToTable output.
"""
function gatk_variants_table_to_clc(df_gatk::DataFrame)
    #Helper function to convert a variant table from GATK to CLC style
    df = DataFrame(Ref_Pos_=Int[], Type=AbstractString[], Length=Int[], Reference=AbstractString[], Allele=AbstractString[])
    for idx = 1:size(df_gatk,1)
        #Check for a contiguous SNV (single-base REF and ALT starting exactly
        #where the previous SNV/MNV row ends) and extend that row into an MNV
        #instead of pushing a new one
        if (idx > 1) && ((df[end, :Type] == "SNV") || (df[end, :Type] == "MNV")) &&
            (df_gatk[idx, :POS] === df[end,:Ref_Pos_]+df[end,:Length]) &&
            (length(df_gatk[idx, :REF]) === 1) && (length(df_gatk[idx, :ALT]) === 1)
            df[end, :Type] = "MNV"
            df[end, :Length] += 1
            df[end, :Reference] *= df_gatk[idx, :REF]
            df[end, :Allele] *= df_gatk[idx, :ALT]
        #Look for insertion: single-base REF, multi-base ALT
        elseif (length(df_gatk[idx, :REF]) === 1) && (length(df_gatk[idx, :ALT]) > 1)
            push!(df, [df_gatk[idx, :POS]+1, "Insertion", length(df_gatk[idx, :ALT]) - 1, "-", df_gatk[idx, :ALT][2:end]] )
        #Look for deletion: multi-base REF, single-base ALT
        elseif (length(df_gatk[idx, :REF]) > 1) && (length(df_gatk[idx, :ALT]) === 1)
            push!(df, [df_gatk[idx, :POS]+1, "Deletion", length(df_gatk[idx, :REF]) - 1, df_gatk[idx, :REF][2:end], "-"] )
        #Otherwise push on the SNP
        else
            push!(df, [df_gatk[idx, :POS], "SNV", 1, df_gatk[idx, :REF], df_gatk[idx, :ALT]])
        end
    end
    return df
end
"""
    consolidate_variant_calls(dir)

Consolidate the per-strain `*.vcf` variant calls in `dir` into a single
DataFrame for comparative analysis, write it to
`\$dir/variant_summary.tsv`, and return it.  Each strain's alleles appear in
a column named after the strain (AZPAEnnnnn).
"""
function consolidate_variant_calls(dir)
    #Grab all the variant call files and convert to tsv tables via GATK
    vcf_files = glob(joinpath(dir, "*.vcf"))
    ref_file = joinpath(dir, "PAO1_reference.fasta")
    for f in vcf_files
        base_name = splitext(f)[1]
        run(`java -jar $GATK -T VariantsToTable -R $ref_file -V $f
            -F POS -F REF -F ALT -F QUAL -o $base_name.variants.tsv`)
    end
    #Load into data frames, convert to CLC style, and rename the :Allele
    #column to the strain name so the outer joins below keep them distinct
    dfs = []
    tsv_files = glob(joinpath(dir, "*.variants.tsv"))
    for f in tsv_files
        df = readtable(f)
        strain_name = match(r"(AZPAE\d{5})", f).captures[1]
        df = gatk_variants_table_to_clc(df)
        rename!(df, :Allele, Symbol(strain_name))
        push!(dfs, df)
    end
    #outer-join everything into a single data frame; we would prefer to join
    #on :Reference as well, but work around
    #[this FIXME](https://github.com/JuliaStats/DataFrames.jl/blob/039432c62a7d01285e93f91a2363eb5caa86036c/src/abstractdataframe/join.jl#L97)
    #by joining without it and reconciling the duplicated column by hand
    main_df = dfs[1]
    for ct = 2:length(dfs)
        main_df = join(main_df, dfs[ct], on=[:Ref_Pos_, :Type, :Length], kind=:outer)
        #check the two Reference columns agree row by row, then merge them
        #(use `row`, not `ct`, so the outer dataframe index is not shadowed
        #and the error message reports the actual row number)
        for row = 1:size(main_df,1)
            if !isna(main_df[row, :Reference]) && !isna(main_df[row, :Reference_1])
                if main_df[row,:Reference] != main_df[row,:Reference_1]
                    error("Rows don't match at $row")
                end
            end
            #fill references missing on the left side from the joined column
            if isna(main_df[row, :Reference])
                main_df[row, :Reference] = main_df[row, :Reference_1]
            end
        end
        delete!(main_df, :Reference_1)
    end
    sort!(main_df, cols=:Ref_Pos_)
    writetable(joinpath(dir, "variant_summary.tsv"), main_df)
    return main_df
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment