Created
February 22, 2016 09:00
-
-
Save cth/f8d53fa026027fbb4836 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/home/fng514/bin/julia | |
#$ -S /home/fng514/bin/julia | |
#$ -cwd | |
# | |
# Split multi allelic sites: | |
# 1. Split multi-allelic sites into multiple VCF lines, s.t., a) each line has one unique alternative allele, and b) if person has an other alternative allele than the one specified for the line in question, then that particular genotype is coded as missing in that line. | |
# 2. Run bi-allelic hardy weinberg test for each of these lines. | |
# | |
# Christian Theil Have, 2016. | |
function split_multiallelic(outvcf,line) | |
fields = split(chomp(line)) | |
alt_alleles = split(fields[5],",") | |
split_lines = Array{ASCIIString}(length(alt_alleles),length(fields)-9) | |
gtidx = first(find(x -> x=="GT", split(fields[9],':'))) | |
for field_idx in 10:length(fields) | |
field = fields[field_idx] | |
field_values = split(field,':') | |
gt = field_values[gtidx] | |
rmatch = match(r"(.*)/(.*)", gt) | |
for i in 1:length(alt_alleles) | |
if rmatch[2] == "0" | |
split_lines[i,field_idx-9] = join(field_values,":") | |
elseif rmatch[2] == alt_alleles[i] | |
field_values_copy = copy(field_values) | |
field_values_copy[gtidx] = rmatch[1]==rmatch[2] ? "1/1" : "0/1" | |
split_lines[i,field_idx-9] = join(field_values_copy,":") | |
else | |
field_values_copy = copy(field_values) | |
field_values_copy[gtidx] = "./." | |
split_lines[i,field_idx-9] = join(field_values_copy,":") | |
end | |
end | |
end | |
# Construct each line from the split_lines array | |
for i in 1:length(alt_alleles) | |
info_part = copy(fields[1:9]) | |
info_part[5] = alt_alleles[i] | |
write(outvcf,string(join(info_part,'\t'),"\t", join(split_lines[i,:], "\t"), "\n")) | |
end | |
end | |
if (length(ARGS) != 2) || (ARGS[1] == ARGS[2]) | |
throw("usage: split_multiallelic in.vcf out.vcf") | |
end | |
open(ARGS[1]) do vcf | |
open(ARGS[2],"w") do outvcf | |
lines_read = 0 | |
line_refs = [] | |
removed_by_variant = [] | |
for line in eachline(vcf) | |
if ismatch(r"^chr", line) | |
if length(line) > 1200 | |
fields = split(line[1:1200], "\t") | |
else | |
fields = split(line[1:end], "\t") | |
end | |
if ismatch(r",", fields[5]) | |
split_multiallelic(outvcf,line) | |
else | |
write(outvcf,line) | |
end | |
else | |
write(outvcf,line) | |
end | |
end | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment