Created
January 22, 2016 13:37
-
-
Save cth/9374a6c03a63eac7a23f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/home/fng514/bin/julia | |
#$ -S /home/fng514/bin/julia | |
#$ -cwd | |
#$ -pe smp 20 | |
# Julia script to read a VCF file and calculate tstv by stepwise GQ and DP tresholds | |
# Christian Theil Have, 2016. | |
import GZip | |
println(string("Reading ",ARGS[1])) | |
@everywhere @enum TSTV transition=1 transversion=2 neither=3 | |
@everywhere gqmin, gqmax = 10,99 | |
@everywhere dpmin, dpmax = 5,150 | |
@everywhere type TsTvStats | |
gq_transitions::Array{Int,1} | |
gq_transversions::Array{Int,1} | |
dp_transitions::Array{Int,1} | |
dp_transversions::Array{Int,1} | |
end | |
@everywhere function transition_or_transversion(allele1, allele2) | |
nucleotides = Set{AbstractString}(["A","G","C","T"]) | |
if( (allele1 == "A" && allele2 == "G") || | |
(allele1 == "G" && allele2 == "A") || | |
(allele1 == "C" && allele2 == "T") || | |
(allele1 == "T" && allele2 == "C")) | |
1 | |
#TSTV::transition | |
elseif (in(allele1,nucleotides) && in(allele2,nucleotides)) | |
2 | |
#TSTV::transversion | |
else | |
3 | |
#TSTV::neither | |
end | |
end | |
@everywhere function process_vcf_line(line) | |
stats = TsTvStats( | |
zeros(Int,length(gqmin:gqmax)),zeros(Int,length(gqmin:gqmax)), | |
zeros(Int,length(dpmin:dpmax)),zeros(Int,length(dpmin:dpmax))) | |
if !ismatch(r"^chr", line) | |
return stats | |
end | |
fields = split(line) | |
println(join(fields[1:4],"\t")) | |
# Find the index of genotype, depth and quality fields | |
gtidx_arr = find(x -> x=="GT", split(fields[9],':')) | |
gqidx_arr = find(x -> x=="GQ", split(fields[9],':')) | |
dpidx_arr = find(x -> x=="DP", split(fields[9],':')) | |
ts_or_tv = transition_or_transversion(fields[4], fields[5]) | |
if ts_or_tv == 1 #TSTV::transition | |
gq_tstv = stats.gq_transitions | |
dp_tstv = stats.dp_transitions | |
elseif ts_or_tv == 2 # TSTV::transversion | |
gq_tstv = stats.gq_transversions | |
dp_tstv = stats.dp_transversions | |
end | |
if (length(gqidx_arr) == 1) && ts_or_tv != 3 | |
for field in fields[10:end] | |
field_values = split(field,':') | |
gt = field_values[first(gtidx_arr)] | |
try | |
gq = parse(Int,field_values[first(gqidx_arr)]) | |
# Cap gqval at gqmax | |
if (gq >= gqmax) gq = gqmax end | |
if (gt == "0/1" && gq >= gqmin) | |
#println("here") | |
gq_tstv[1+gq-gqmin] += 1 | |
elseif (gt == "1/1" && gq >= gqmin) | |
gq_tstv[1+gq-gqmin] += 2 | |
end | |
end | |
try | |
dp = parse(Int,field_values[first(dpidx_arr)]) | |
# Cap dpval at dpmax | |
if (dp >= dpmax) dp = dpmax end | |
if (gt == "0/1" && dp >= dpmin) | |
dp_tstv[1+dp-dpmin] += 1 | |
elseif (gt == "1/1" && dp >= dpmin) | |
dp_tstv[1+dp-dpmin] += 2 | |
end | |
end | |
end | |
# down-propagate counts | |
acc=0 | |
for i in 1+reverse(gqmin:gqmax)-gqmin | |
gq_tstv[i] += acc | |
acc = gq_tstv[i] | |
end | |
acc=0 | |
for i in 1+reverse(dpmin:dpmax)-dpmin | |
dp_tstv[i] += acc | |
acc = dp_tstv[i] | |
end | |
end | |
return stats | |
end | |
function reduce_tstv_stats(stats1::TsTvStats, stats2::TsTvStats) | |
TsTvStats( | |
stats1.gq_transitions + stats2.gq_transitions, | |
stats1.gq_transversions + stats2.gq_transversions, | |
stats1.dp_transitions + stats2.dp_transitions, | |
stats1.dp_transversions + stats2.dp_transversions) | |
end | |
GZip.open(ARGS[1]) do vcf | |
stats = reduce(reduce_tstv_stats, pmap(process_vcf_line,eachline(vcf))) | |
writedlm("tstv_gq.tsv", hcat(gqmin:gqmax, stats.gq_transitions, stats.gq_transversions, stats.gq_transitions ./ stats.gq_transversions)) | |
writedlm("tstv_dp.tsv", hcat(dpmin:dpmax, stats.dp_transitions, stats.dp_transversions, stats.dp_transitions ./ stats.dp_transversions)) | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment