Skip to content

Instantly share code, notes, and snippets.

@cth
Created January 19, 2016 14:06
Show Gist options
  • Save cth/93cf45e3b48c72d0cdca to your computer and use it in GitHub Desktop.
Save cth/93cf45e3b48c72d0cdca to your computer and use it in GitHub Desktop.
#!/home/fng514/bin/julia
#$ -S /home/fng514/bin/julia
#$ -cwd
#$ -l pe smp 20
# Julia script to to read VCF file and calculate quality and depth statistics (in parallel)
# Christian Theil Have, 2016.
import GZip
println(string("Reading ",ARGS[1]))
@everywhere function process_vcf_line(line)
fields = split(line)
# Find the index of genotype quality fields
gqidx_arr = find(x -> x=="GQ", split(fields[9],':'))
qualities = Array{Int64,1}()
depths = Array{Int64,1}()
NAs = 0
if (length(gqidx_arr) == 1)
gqidx = first(gqidx_arr)
dpidx = first(find(x -> x=="DP", split(fields[9],':')))
for field in fields[10:end]
field_values = split(field,':')
gq = field_values[gqidx]
dp = field_values[dpidx]
if (gq == ".")
NAs += 1
else
push!(qualities,parse(Int64,gq))
end
if (dp != ".")
push!(depths,parse(Int64,dp))
end
end
Nullable([
fields[1], fields[2], fields[3],
reduce(min,qualities),reduce(max,qualities),median(qualities), length(qualities),
reduce(min,depths), reduce(max,depths), median(depths), std(depths)
])
else
null
end
end
nonnull(x) = !isnull(x)
GZip.open(ARGS[1]) do vcf
lines_read = 0
line_refs = []
for line in eachline(vcf)
lines_read = lines_read + 1
if (lines_read % 1000 == 0)
println(string(lines_read, " lines read"))
end
if !ismatch(r"^chr", line)
continue
end
push!(line_refs,@spawn process_vcf_line(line))
end
open("quality_depth_summary.tsv","w") do outfile
for i = line_refs
try write(outfile,join(get(fetch(i)),"\t"), "\n") end
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment