Skip to content

Instantly share code, notes, and snippets.

@cth
Created January 22, 2016 13:37
Show Gist options
  • Save cth/9374a6c03a63eac7a23f to your computer and use it in GitHub Desktop.
Save cth/9374a6c03a63eac7a23f to your computer and use it in GitHub Desktop.
#!/home/fng514/bin/julia
#$ -S /home/fng514/bin/julia
#$ -cwd
#$ -pe smp 20
# Julia script to read a VCF file and calculate tstv by stepwise GQ and DP tresholds
# Christian Theil Have, 2016.
import GZip
println(string("Reading ",ARGS[1]))
@everywhere @enum TSTV transition=1 transversion=2 neither=3
@everywhere gqmin, gqmax = 10,99
@everywhere dpmin, dpmax = 5,150
@everywhere type TsTvStats
gq_transitions::Array{Int,1}
gq_transversions::Array{Int,1}
dp_transitions::Array{Int,1}
dp_transversions::Array{Int,1}
end
@everywhere function transition_or_transversion(allele1, allele2)
nucleotides = Set{AbstractString}(["A","G","C","T"])
if( (allele1 == "A" && allele2 == "G") ||
(allele1 == "G" && allele2 == "A") ||
(allele1 == "C" && allele2 == "T") ||
(allele1 == "T" && allele2 == "C"))
1
#TSTV::transition
elseif (in(allele1,nucleotides) && in(allele2,nucleotides))
2
#TSTV::transversion
else
3
#TSTV::neither
end
end
@everywhere function process_vcf_line(line)
stats = TsTvStats(
zeros(Int,length(gqmin:gqmax)),zeros(Int,length(gqmin:gqmax)),
zeros(Int,length(dpmin:dpmax)),zeros(Int,length(dpmin:dpmax)))
if !ismatch(r"^chr", line)
return stats
end
fields = split(line)
println(join(fields[1:4],"\t"))
# Find the index of genotype, depth and quality fields
gtidx_arr = find(x -> x=="GT", split(fields[9],':'))
gqidx_arr = find(x -> x=="GQ", split(fields[9],':'))
dpidx_arr = find(x -> x=="DP", split(fields[9],':'))
ts_or_tv = transition_or_transversion(fields[4], fields[5])
if ts_or_tv == 1 #TSTV::transition
gq_tstv = stats.gq_transitions
dp_tstv = stats.dp_transitions
elseif ts_or_tv == 2 # TSTV::transversion
gq_tstv = stats.gq_transversions
dp_tstv = stats.dp_transversions
end
if (length(gqidx_arr) == 1) && ts_or_tv != 3
for field in fields[10:end]
field_values = split(field,':')
gt = field_values[first(gtidx_arr)]
try
gq = parse(Int,field_values[first(gqidx_arr)])
# Cap gqval at gqmax
if (gq >= gqmax) gq = gqmax end
if (gt == "0/1" && gq >= gqmin)
#println("here")
gq_tstv[1+gq-gqmin] += 1
elseif (gt == "1/1" && gq >= gqmin)
gq_tstv[1+gq-gqmin] += 2
end
end
try
dp = parse(Int,field_values[first(dpidx_arr)])
# Cap dpval at dpmax
if (dp >= dpmax) dp = dpmax end
if (gt == "0/1" && dp >= dpmin)
dp_tstv[1+dp-dpmin] += 1
elseif (gt == "1/1" && dp >= dpmin)
dp_tstv[1+dp-dpmin] += 2
end
end
end
# down-propagate counts
acc=0
for i in 1+reverse(gqmin:gqmax)-gqmin
gq_tstv[i] += acc
acc = gq_tstv[i]
end
acc=0
for i in 1+reverse(dpmin:dpmax)-dpmin
dp_tstv[i] += acc
acc = dp_tstv[i]
end
end
return stats
end
function reduce_tstv_stats(stats1::TsTvStats, stats2::TsTvStats)
TsTvStats(
stats1.gq_transitions + stats2.gq_transitions,
stats1.gq_transversions + stats2.gq_transversions,
stats1.dp_transitions + stats2.dp_transitions,
stats1.dp_transversions + stats2.dp_transversions)
end
GZip.open(ARGS[1]) do vcf
stats = reduce(reduce_tstv_stats, pmap(process_vcf_line,eachline(vcf)))
writedlm("tstv_gq.tsv", hcat(gqmin:gqmax, stats.gq_transitions, stats.gq_transversions, stats.gq_transitions ./ stats.gq_transversions))
writedlm("tstv_dp.tsv", hcat(dpmin:dpmax, stats.dp_transitions, stats.dp_transversions, stats.dp_transitions ./ stats.dp_transversions))
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment