Created
May 30, 2016 00:29
-
-
Save caryan/1a5624d8539c83d01cef2663b3bc2e7d to your computer and use it in GitHub Desktop.
Julia script wrapping a Picard and GATK variant calling pipeline
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using Glob | |
using DataFrames | |
#Absolute paths to the Picard and GATK jar files invoked via `java -jar`
#below — edit these to match the local installation
const PICARD = "/home/cryan/Downloads/picard-tools-2.1.0/picard.jar"
const GATK = "/home/cryan/Downloads/GATK/GenomeAnalysisTK.jar"
"""
    prepare_reference(ref)

Prepare a reference genome for the alignment/variant-calling pipeline.
`ref` is a path prefix: the sequence is read from `\$ref.fasta` and the
index/dictionary files are written alongside it.
"""
function prepare_reference(ref)
    preparation_steps = [
        #bwa's own index files, required by `bwa mem`
        `bwa index $ref.fasta`,
        #samtools .fai index TODO: does it need to be sorted? .fai file seems minimal
        `samtools faidx $ref.fasta`,
        #sequence dictionary (required for IndelRealigner)
        `java -jar $PICARD CreateSequenceDictionary REFERENCE=$ref.fasta OUTPUT=$ref.dict`,
    ]
    for cmd in preparation_steps
        run(cmd)
    end
end
"""
    call_variants(ref, reads)

Align paired-end reads against a reference and call variants with GATK.

`ref` and `reads` are path prefixes: the reference lives in `\$ref.fasta`
(already prepared by `prepare_reference`) and the reads in
`\$(reads)_R1_001.fastq.gz` / `\$(reads)_R2_001.fastq.gz`.  Intermediate
.sam/.bam files and the final `\$reads.vcf` are written alongside the inputs.
"""
function call_variants(ref, reads)
    #align reads using bwa; the SAM output on stdout is redirected to a file
    run(pipeline(`bwa mem $ref.fasta $(reads)_R1_001.fastq.gz $(reads)_R2_001.fastq.gz`, "$reads.aln.sam"))
    #Extract the strain name up front; fail with a clear message instead of
    #an opaque error on `nothing` when the path does not contain AZPAEnnnnn
    m = match(r"AZPAE\d{5}", reads)
    m === nothing && error("could not extract an AZPAE strain name from \"$reads\"")
    strain = m.match
    #Sort SAM file, add fake ReadGroup IDs (otherwise will fail at GATK
    #RealignerTargetCreator below) and convert to bam
    run(`java -jar $PICARD AddOrReplaceReadGroups I=$reads.aln.sam O=$reads.sorted.bam
        RGLB=kos1 RGPL=illumina RGPU=unit1 RGSM=$strain SORT_ORDER=coordinate`)
    # mark duplicates
    run(`java -jar $PICARD MarkDuplicates I=$reads.sorted.bam O=$reads.dedup.bam METRICS_FILE=metrics.txt`)
    # build bam index (requires bam file sorted in coordinate order)
    run(`java -jar $PICARD BuildBamIndex I=$reads.dedup.bam`)
    #Create realignment targets for IndelRealigner below
    run(`java -jar $GATK -T RealignerTargetCreator -R $ref.fasta -I $reads.dedup.bam -o $reads.targetintervals.list`)
    # Indel realignment
    #needs a reference dictionary (ref.dict for ref.fasta)
    run(`java -jar $GATK -T IndelRealigner -R $ref.fasta -I $reads.dedup.bam
        -targetIntervals $reads.targetintervals.list -o $reads.realigned.bam`)
    #Call variants (takes ~10 minutes to run on a pair of 150M fastq.gz file )
    run(`java -jar $GATK -T HaplotypeCaller -R $ref.fasta -I $reads.realigned.bam
        -ploidy 1 -o $reads.vcf`)
end
"""
    call_variants_directory(dir, ref)

Run `call_variants` for every strain's paired fastq.gz reads found in `dir`,
after preparing the reference `ref` (a filename prefix inside `dir`).
"""
function call_variants_directory(dir, ref)
    prepare_reference(joinpath(dir,ref))
    #extract the first part before the R1 or R2 and remove duplicates so
    #each read pair is processed exactly once; skip any stray fastq.gz files
    #that do not follow the AZPAEnnnnn naming convention instead of erroring
    fastq_files = glob("*.fastq.gz", dir)
    strain_matches = map(x -> match(r"(AZPAE\d{5}.*)_R[12]_001.fastq.gz", x), fastq_files)
    strains = unique(map(m -> m.captures[1], filter(m -> m !== nothing, strain_matches)))
    #@sync is required: a bare `@parallel for` dispatches the work and
    #returns immediately, so without it this function would return before
    #the workers finished calling variants
    @sync @parallel for strain in strains
        call_variants(joinpath(dir,ref), joinpath(dir,strain))
    end
end
"""
    gatk_variants_table_to_clc(df_gatk::DataFrame)

Convert a variant table produced by GATK `VariantsToTable` (columns `:POS`,
`:REF`, `:ALT`, ...) into a CLC-workbench-style table with columns
`:Ref_Pos_`, `:Type`, `:Length`, `:Reference`, `:Allele`.

Runs of adjacent single-base substitutions are merged into one "MNV" row;
REF/ALT pairs of unequal length become "Insertion"/"Deletion" rows (the `+1`
position offset and `[2:end]` slices drop the shared anchor base on which
GATK left-aligns indels); everything else becomes an "SNV" row.
NOTE(review): assumes `df_gatk` rows are sorted by position — TODO confirm
with the VariantsToTable output.
"""
function gatk_variants_table_to_clc(df_gatk::DataFrame)
    #Helper function to convert a variant table from GATK to CLC style
    df = DataFrame(Ref_Pos_=Int[], Type=AbstractString[], Length=Int[], Reference=AbstractString[], Allele=AbstractString[])
    for idx = 1:size(df_gatk,1)
        #Check for a contiguous SNV (single-base REF and ALT starting exactly
        #where the previous SNV/MNV row ends) and extend that row into an MNV
        #instead of pushing a new one
        if (idx > 1) && ((df[end, :Type] == "SNV") || (df[end, :Type] == "MNV")) &&
            (df_gatk[idx, :POS] === df[end,:Ref_Pos_]+df[end,:Length]) &&
            (length(df_gatk[idx, :REF]) === 1) && (length(df_gatk[idx, :ALT]) === 1)
            df[end, :Type] = "MNV"
            df[end, :Length] += 1
            df[end, :Reference] *= df_gatk[idx, :REF]
            df[end, :Allele] *= df_gatk[idx, :ALT]
        #Look for insertion: single-base REF, multi-base ALT
        elseif (length(df_gatk[idx, :REF]) === 1) && (length(df_gatk[idx, :ALT]) > 1)
            push!(df, [df_gatk[idx, :POS]+1, "Insertion", length(df_gatk[idx, :ALT]) - 1, "-", df_gatk[idx, :ALT][2:end]] )
        #Look for deletion: multi-base REF, single-base ALT
        elseif (length(df_gatk[idx, :REF]) > 1) && (length(df_gatk[idx, :ALT]) === 1)
            push!(df, [df_gatk[idx, :POS]+1, "Deletion", length(df_gatk[idx, :REF]) - 1, df_gatk[idx, :REF][2:end], "-"] )
        #Otherwise push on the SNP
        else
            push!(df, [df_gatk[idx, :POS], "SNV", 1, df_gatk[idx, :REF], df_gatk[idx, :ALT]])
        end
    end
    return df
end
"""
    consolidate_variant_calls(dir)

Consolidate the per-strain `*.vcf` variant calls in `dir` into a single
DataFrame for comparative analysis, write it to
`\$dir/variant_summary.tsv`, and return it.  Each strain's alleles appear in
a column named after the strain (AZPAEnnnnn).
"""
function consolidate_variant_calls(dir)
    #Grab all the variant call files and convert to tsv tables via GATK
    vcf_files = glob(joinpath(dir, "*.vcf"))
    ref_file = joinpath(dir, "PAO1_reference.fasta")
    for f in vcf_files
        base_name = splitext(f)[1]
        run(`java -jar $GATK -T VariantsToTable -R $ref_file -V $f
            -F POS -F REF -F ALT -F QUAL -o $base_name.variants.tsv`)
    end
    #Load into data frames, convert to CLC style, and rename the :Allele
    #column to the strain name so the outer joins below keep them distinct
    dfs = []
    tsv_files = glob(joinpath(dir, "*.variants.tsv"))
    for f in tsv_files
        df = readtable(f)
        strain_name = match(r"(AZPAE\d{5})", f).captures[1]
        df = gatk_variants_table_to_clc(df)
        rename!(df, :Allele, Symbol(strain_name))
        push!(dfs, df)
    end
    #outer-join everything into a single data frame; we would prefer to join
    #on :Reference as well, but work around
    #[this FIXME](https://github.com/JuliaStats/DataFrames.jl/blob/039432c62a7d01285e93f91a2363eb5caa86036c/src/abstractdataframe/join.jl#L97)
    #by joining without it and reconciling the duplicated column by hand
    main_df = dfs[1]
    for ct = 2:length(dfs)
        main_df = join(main_df, dfs[ct], on=[:Ref_Pos_, :Type, :Length], kind=:outer)
        #check the two Reference columns agree row by row, then merge them
        #(use `row`, not `ct`, so the outer dataframe index is not shadowed
        #and the error message reports the actual row number)
        for row = 1:size(main_df,1)
            if !isna(main_df[row, :Reference]) && !isna(main_df[row, :Reference_1])
                if main_df[row,:Reference] != main_df[row,:Reference_1]
                    error("Rows don't match at $row")
                end
            end
            #fill references missing on the left side from the joined column
            if isna(main_df[row, :Reference])
                main_df[row, :Reference] = main_df[row, :Reference_1]
            end
        end
        delete!(main_df, :Reference_1)
    end
    sort!(main_df, cols=:Ref_Pos_)
    writetable(joinpath(dir, "variant_summary.tsv"), main_df)
    return main_df
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment