Skip to content

Instantly share code, notes, and snippets.

@cth
Created April 12, 2016 13:42
Show Gist options
  • Save cth/1a2f979a49bd1a67dfce27911a400606 to your computer and use it in GitHub Desktop.
Save cth/1a2f979a49bd1a67dfce27911a400606 to your computer and use it in GitHub Desktop.
#!/home/fng514/bin/julia
#$ -S /home/fng514/bin/julia
#$ -cwd
# Christian Theil Have, 2016.
#import GZip
function process_vcf_line(outfile,line, dpmin, gqmin)
fields = split(line)
# Find the index of genotype quality fields
#println(string(":>", fields[1:9]))
gtidx = first(find(x -> x=="GT", split(fields[9],':')))
gqidx = first(find(x -> x=="GQ", split(fields[9],':')))
dpidx = first(find(x -> x=="DP", split(fields[9],':')))
removed = 0
i = 1
field_idx = 10
for field in fields[10:end]
field_values = split(field,':')
gt = field_values[gtidx]
gq = field_values[gqidx]
dp = field_values[dpidx]
if gt != "./."
if (parse(Int,dp) < dpmin) || (gq != "." && parse(Int,gq) < gqmin)
field_values[gtidx] = "./."
removed += 1
end
end
fields[field_idx] = join(field_values,":")
field_idx += 1
end
#println(join(fields,"\t"))
write(outfile,string(join(fields,"\t"), "\n"))
return [fields[1], fields[2], fields[3], removed]
end
function process_vcf_line(outfile,line, dpmin)
fields = split(line)
# Find the index of genotype quality fields
gtidx = first(find(x -> x=="GT", split(fields[9],':')))
dpidx = first(find(x -> x=="DP", split(fields[9],':')))
removed = 0
i = 1
field_idx = 10
for field in fields[10:end]
field_values = split(field,':')
gt = field_values[gtidx]
dp = field_values[dpidx]
if gt != "./."
if ((try parse(Int,dp) catch 0 end) < dpmin )
field_values[gtidx] = "./."
removed += 1
end
end
fields[field_idx] = join(field_values,":")
field_idx += 1
end
#println(join(fields,"\t"))
write(outfile,string(join(fields,"\t"), "\n"))
return [fields[1], fields[2], fields[3], removed]
end
DP_threshold=20
GQ_threshold=20
outvcf = open(ARGS[2], "w")
bad_sites_file = open(string(ARGS[1], ".bad_sites.vcf"), "w")
open(ARGS[1]) do vcf
lines_read = 0
line_refs = []
removed_by_variant = []
for line in eachline(vcf)
if ismatch(r"^chr", line)
try
push!(removed_by_variant,process_vcf_line(outvcf, line, DP_threshold, GQ_threshold))
catch
push!(removed_by_variant,process_vcf_line(outvcf, line, DP_threshold))
write(bad_sites_file, line)
end
else
write(bad_sites_file, line)
write(outvcf,line)
end
lines_read += 1
if lines_read % 10 == 0
println(lines_read)
end
end
open(string(ARGS[2], "_summary_dpgq"), "w") do outfile
for v in removed_by_variant
write(outfile,string(join(v,"\t"),"\n"))
end
end
end
close(bad_sites_file)
close(outvcf)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment