Skip to content

Instantly share code, notes, and snippets.

@cth
Created September 29, 2016 13:12
Show Gist options
  • Save cth/2f072980fb66ccd086ffa305286bfbf2 to your computer and use it in GitHub Desktop.
using DataFrames
# Drop the 3-character identifier prefix (e.g. "sv_") and parse the remainder
# as an Int32 id. Assumes the prefix is always exactly 3 characters — TODO confirm.
parse_identifier(x) = parse(Int32, SubString(x, 4))
"""
    read_snps1(file)

Read one identifier per line from `file` and return the parsed ids as a
`Set{Int32}`. Each line is stripped of its trailing newline and passed
through `parse_identifier`.
"""
function read_snps1(file)
    open(file) do io
        return Set{Int32}(parse_identifier(chomp(row)) for row in eachline(io))
    end
end
# Load the set of GWAS SNP ids once at startup; @time reports the load cost.
@time gwas_snps = read_snps1("workdir/snpsonly.list")
# Should be possible to use a flyweight pattern to reduce memory for this one
# Maps SV id -> list of annotation tags; `annotation_groups` collects every
# distinct tag ever pushed, so tables can later have one column per tag.
annotations = Dict{Int32,Vector{Symbol}}()
annotation_groups = Set{Symbol}()
# Pre-size for the expected number of SVs to avoid rehashing while loading.
sizehint!(annotations, 6000000)

# Append `tag` to the annotation list of SV `id` (which must already have an
# entry in `annotations`) and record the tag in `annotation_groups`.
function push_tag(id, tag)
    push!(annotations[id], tag)
    return push!(annotation_groups, tag)
end
# Load per-SV annotations. Each TSV line is: <sv id> <type> <length> <extra tag|NA>.
# Populates `annotations` (sv id -> tag list) and, via push_tag, `annotation_groups`.
@time open("workdir/annotations_sv_5pct.tsv") do f
    for l in eachline(f)
        fields = split(chomp(l))
        svid = parse_identifier(fields[1])
        # Fresh tag list per SV; a repeated id would reset earlier tags — presumably
        # ids are unique in the input, TODO confirm.
        annotations[svid] = Array{Symbol,1}()
        push_tag(svid, Symbol(fields[2]))
        if fields[4] != "NA"
            push_tag(svid, Symbol(fields[4]))
        end
        # Renamed from `length`, which shadowed Base.length inside this block.
        svlen = parse(Int, fields[3])
        # Size buckets: >50 large, 11..50 medium, <=10 small (the original's
        # final `elseif svlen <= 10` was exhaustive, so a plain `else` suffices).
        if svlen > 50
            push_tag(svid, :large)
        elseif svlen > 10
            push_tag(svid, :medium)
        else
            push_tag(svid, :small)
        end
    end
end
"""
    max_sv_ld(ldfile, snps, annot, annot_groups, ldmeasure, steps)

Scan a PLINK-style LD table and, for every annotated structural variant (SV)
that is in LD with a SNP from `snps`, record the HIGHEST observed LD value,
discretized onto a grid of `steps + 1` points in [0, 1].

- `ldmeasure` selects the column to read: `:r2` (column 7) or `:dprime`
  (column 8); anything else raises `ArgumentError`.
- SNP ids are taken from column 3, SV ids from column 6; header rows
  (first field `"CHR_A"`) are skipped.

Returns a `Dict{Int32,Float32}` mapping SV id -> maximum discretized LD.
"""
function max_sv_ld(ldfile, snps, annot, annot_groups, ldmeasure, steps)
    println("max_sv_ld")
    highest_ld = Dict{Int32,Float32}()
    # Snap an LD value onto the nearest of the steps+1 grid points.
    round_ld(ld, steps) = round(steps * ld) / steps
    lines_processed = 0
    open(ldfile) do f
        for line in eachline(f)
            fields = split(line)
            lines_processed += 1
            if lines_processed % 1000000 == 0
                println(string("lines processed:", lines_processed, " #SV=", length(highest_ld)))
            end
            # Skip the header row.
            if fields[1] == "CHR_A"
                continue
            end
            snp, sv = parse_identifier(fields[3]), parse_identifier(fields[6])
            if ldmeasure == :r2
                ld = round_ld(parse(Float32, fields[7]), steps)
            elseif ldmeasure == :dprime
                ld = round_ld(parse(Float32, fields[8]), steps)
            else
                # Previously an unknown measure crashed later with UndefVarError;
                # fail fast with a clear message instead.
                throw(ArgumentError("ldmeasure must be :r2 or :dprime, got $ldmeasure"))
            end
            # Sanity check: SNP ids should never appear in the SV annotation map.
            if haskey(annot, snp)
                println("Houston - we have a problem!")
            end
            if snp in snps && haskey(annot, sv) # This operation is insanly expensive!!
                # BUG FIX: keep the MAXIMUM LD per SV. The original updated the
                # entry when `highest_ld[sv] > ld`, i.e. it kept the minimum,
                # contradicting the function's name and the table built from it.
                prev = get(highest_ld, sv, ld)
                highest_ld[sv] = max(prev, ld)
            end
        end
    end
    return highest_ld
end
"""
    make_cumulative_table(maxld, annot, annot_groups, steps)

Create a DataFrame with one row per LD grid point (`0, 1/steps, ..., 1`) and
one column per annotation group; each cell counts the SVs of that group whose
highest SNP LD (from `maxld`) is at or above that grid point (cumulative
from the top).
"""
function make_cumulative_table(maxld, annot, annot_groups, steps)
    println("make_cumulative_table")
    ldsum = DataFrame( ld = (0:steps)/steps )
    for group in annot_groups
        # BUG FIX: `rep` is not a Julia function (an R-ism dropped from
        # DataFrames); use Base's `fill` to make one zero counter per LD bin.
        ldsum[group] = fill(0, steps + 1)
    end
    # Histogram pass: count each SV once per annotation group in the bin of
    # its maximum LD value.
    for (sv, ld) in maxld
        row = 1 + Integer(round(steps * ld))
        for group in annot[sv]
            ldsum[row, group] += 1
        end
    end
    # Down-propagate counts (from high LD to low) to make each column cumulative.
    # Iterating in reverse means ldsum[ld_idx+1, group] is already cumulative.
    for group in annot_groups
        for ld_idx in reverse(1:steps)
            ldsum[ld_idx, group] = first(ldsum[ld_idx, group]) + first(ldsum[ld_idx + 1, group])
        end
    end
    return ldsum
end
"""
    cumulative_ld_table(ldfile, snps, annot, annot_groups, ldmeasure::Symbol, steps)

Compute the per-SV maximum LD from `ldfile` and turn it into a cumulative
count table (see `make_cumulative_table`). Both phases are wrapped in `@time`
so their costs are reported separately.
"""
function cumulative_ld_table(ldfile, snps, annot, annot_groups, ldmeasure::Symbol, steps)
    maxld = @time max_sv_ld(ldfile, snps, annot, annot_groups, ldmeasure, steps)
    return @time make_cumulative_table(maxld, annot, annot_groups, steps)
end
# CLI entry point. ARGS: [1] LD input file, [2] LD measure name ("r2" or
# "dprime"), [3] output table path. Uses 100 discretization steps.
writetable(ARGS[3], cumulative_ld_table(ARGS[1],gwas_snps,annotations, annotation_groups,Symbol(ARGS[2]), 100))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment