Created
September 29, 2016 13:12
-
-
Save cth/2f072980fb66ccd086ffa305286bfbf2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using DataFrames | |
parse_identifier(x) = parse(Int32,x[4:end]) | |
function read_snps1(file) | |
snps = Set{Int32}() | |
open(file) do f | |
for l in eachline(f) | |
push!(snps,parse_identifier(chomp(l))) | |
end | |
end | |
return snps | |
end | |
@time gwas_snps = read_snps1("workdir/snpsonly.list") | |
# Should be possible to use a flyweight pattern to reduce memory for this one | |
annotations=Dict{Int32,Array{Symbol,1}}() | |
annotation_groups = Set{Symbol}() | |
sizehint!(annotations, 6000000) | |
push_tag(id,tag) = (push!(annotations[id],tag) ; push!(annotation_groups,tag)) | |
@time open("workdir/annotations_sv_5pct.tsv") do f | |
for l in eachline(f) | |
fields = split(chomp(l)) | |
svid = parse_identifier(fields[1]) | |
annotations[svid] = Array{Symbol,1}() | |
push_tag(svid,Symbol(fields[2])) | |
if (fields[4] != "NA") | |
push_tag(svid,Symbol(fields[4])) | |
end | |
length = parse(Int,fields[3]) | |
if length > 50 | |
push_tag(svid,:large) | |
elseif length > 10 | |
push_tag(svid,:medium) | |
elseif length <= 10 | |
push_tag(svid,:small) | |
end | |
end | |
end | |
function max_sv_ld(ldfile, snps, annot, annot_groups, ldmeasure, steps) | |
println("max_sv_ld") | |
highest_ld = Dict{Int32,Float32}() | |
ld_range = (0:steps)/steps | |
round_ld(ld,steps) = round(steps*ld) / steps | |
lines_processed = 0 | |
open(ldfile) do f | |
for line in eachline(f) | |
fields = split(line) | |
lines_processed = lines_processed + 1 | |
if lines_processed % 1000000 == 0 | |
println(string("lines processed:", lines_processed, " #SV=", length(highest_ld))) | |
end | |
if (fields[1] == "CHR_A") | |
continue | |
end | |
snp,sv = parse_identifier(fields[3]), parse_identifier(fields[6]) | |
if ldmeasure == :r2 | |
ld = round_ld(parse(Float32,fields[7]),steps) | |
elseif ldmeasure == :dprime | |
ld = round_ld(parse(Float32,fields[8]),steps) | |
end | |
if haskey(annot,snp) | |
println("Houston - we have a problem!") | |
end | |
if snp in snps && haskey(annot,sv) # This operation is insanly expensive!! | |
if haskey(highest_ld,sv) | |
if highest_ld[sv] > ld | |
highest_ld[sv] = ld | |
end | |
else | |
highest_ld[sv] = ld | |
end | |
end | |
end | |
end | |
highest_ld | |
end | |
function make_cumulative_table(maxld,annot, annot_groups, steps) | |
println("make_cumulative_table") | |
# Create a dataframe with columns for each specified SV group | |
# and rows for each LD interval in specifed range, s.t., | |
# each cell hold the number of times that we observed an | |
# the number of observation of higest LD between (column-type) SV and a SNP. | |
ldsum = DataFrame( ld = (0:steps)/steps ) | |
for group in annot_groups | |
ldsum[group] = rep(0,steps+1) | |
end | |
for kv in maxld | |
# determine groups | |
sv, ld = kv | |
for group in annot[sv] | |
ldsum[1+Integer(round(steps*ld)),group] += 1 | |
end | |
end | |
# Down-propagate counts for each range to make table cumulative | |
for group in annot_groups | |
cumsum = 0 | |
for ld_idx in reverse(1:steps) | |
cumsum = first(ldsum[ld_idx,group]) + first(ldsum[ld_idx+1,group]) | |
ldsum[ld_idx,group] = cumsum | |
end | |
end | |
ldsum | |
end | |
function cumulative_ld_table(ldfile,snps,annot,annot_groups,ldmeasure::Symbol,steps) | |
@time maxld = max_sv_ld(ldfile, snps, annot, annot_groups, ldmeasure, steps) | |
@time make_cumulative_table(maxld, annot, annot_groups,steps) | |
end | |
writetable(ARGS[3], cumulative_ld_table(ARGS[1],gwas_snps,annotations, annotation_groups,Symbol(ARGS[2]), 100)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment