Skip to content

Instantly share code, notes, and snippets.

@cth
Created September 29, 2016 13:12
Show Gist options
  • Save cth/2f072980fb66ccd086ffa305286bfbf2 to your computer and use it in GitHub Desktop.
using DataFrames
# Drop the 3-character identifier prefix (e.g. "sv_") and parse the remainder
# as an Int32 id. Assumes the prefix is always exactly 3 characters — TODO confirm.
parse_identifier(x) = parse(Int32, SubString(x, 4))
"""
    read_snps1(file)

Read one identifier per line from `file` and return the parsed ids as a
`Set{Int32}`. Each line is stripped of its trailing newline and passed
through `parse_identifier`.
"""
function read_snps1(file)
    open(file) do io
        return Set{Int32}(parse_identifier(chomp(row)) for row in eachline(io))
    end
end
# Load the set of GWAS SNP ids once at startup; @time reports the load cost.
@time gwas_snps = read_snps1("workdir/snpsonly.list")
# Should be possible to use a flyweight pattern to reduce memory for this one
# Maps SV id -> list of annotation tags; `annotation_groups` collects every
# distinct tag ever pushed, so tables can later have one column per tag.
annotations = Dict{Int32,Vector{Symbol}}()
annotation_groups = Set{Symbol}()
# Pre-size for the expected number of SVs to avoid rehashing while loading.
sizehint!(annotations, 6000000)

# Append `tag` to the annotation list of SV `id` (which must already have an
# entry in `annotations`) and record the tag in `annotation_groups`.
function push_tag(id, tag)
    push!(annotations[id], tag)
    return push!(annotation_groups, tag)
end
# Load per-SV annotations. Each TSV line is: <sv id> <type> <length> <extra tag|NA>.
# Populates `annotations` (sv id -> tag list) and, via push_tag, `annotation_groups`.
@time open("workdir/annotations_sv_5pct.tsv") do f
    for l in eachline(f)
        fields = split(chomp(l))
        svid = parse_identifier(fields[1])
        # Fresh tag list per SV; a repeated id would reset earlier tags — presumably
        # ids are unique in the input, TODO confirm.
        annotations[svid] = Array{Symbol,1}()
        push_tag(svid, Symbol(fields[2]))
        if fields[4] != "NA"
            push_tag(svid, Symbol(fields[4]))
        end
        # Renamed from `length`, which shadowed Base.length inside this block.
        svlen = parse(Int, fields[3])
        # Size buckets: >50 large, 11..50 medium, <=10 small (the original's
        # final `elseif svlen <= 10` was exhaustive, so a plain `else` suffices).
        if svlen > 50
            push_tag(svid, :large)
        elseif svlen > 10
            push_tag(svid, :medium)
        else
            push_tag(svid, :small)
        end
    end
end
"""
    max_sv_ld(ldfile, snps, annot, annot_groups, ldmeasure, steps)

Scan a PLINK-style LD table and, for every annotated structural variant (SV)
that is in LD with a SNP from `snps`, record the HIGHEST observed LD value,
discretized onto a grid of `steps + 1` points in [0, 1].

- `ldmeasure` selects the column to read: `:r2` (column 7) or `:dprime`
  (column 8); anything else raises `ArgumentError`.
- SNP ids are taken from column 3, SV ids from column 6; header rows
  (first field `"CHR_A"`) are skipped.

Returns a `Dict{Int32,Float32}` mapping SV id -> maximum discretized LD.
"""
function max_sv_ld(ldfile, snps, annot, annot_groups, ldmeasure, steps)
    println("max_sv_ld")
    highest_ld = Dict{Int32,Float32}()
    # Snap an LD value onto the nearest of the steps+1 grid points.
    round_ld(ld, steps) = round(steps * ld) / steps
    lines_processed = 0
    open(ldfile) do f
        for line in eachline(f)
            fields = split(line)
            lines_processed += 1
            if lines_processed % 1000000 == 0
                println(string("lines processed:", lines_processed, " #SV=", length(highest_ld)))
            end
            # Skip the header row.
            if fields[1] == "CHR_A"
                continue
            end
            snp, sv = parse_identifier(fields[3]), parse_identifier(fields[6])
            if ldmeasure == :r2
                ld = round_ld(parse(Float32, fields[7]), steps)
            elseif ldmeasure == :dprime
                ld = round_ld(parse(Float32, fields[8]), steps)
            else
                # Previously an unknown measure crashed later with UndefVarError;
                # fail fast with a clear message instead.
                throw(ArgumentError("ldmeasure must be :r2 or :dprime, got $ldmeasure"))
            end
            # Sanity check: SNP ids should never appear in the SV annotation map.
            if haskey(annot, snp)
                println("Houston - we have a problem!")
            end
            if snp in snps && haskey(annot, sv) # This operation is insanly expensive!!
                # BUG FIX: keep the MAXIMUM LD per SV. The original updated the
                # entry when `highest_ld[sv] > ld`, i.e. it kept the minimum,
                # contradicting the function's name and the table built from it.
                prev = get(highest_ld, sv, ld)
                highest_ld[sv] = max(prev, ld)
            end
        end
    end
    return highest_ld
end
"""
    make_cumulative_table(maxld, annot, annot_groups, steps)

Create a DataFrame with one row per LD grid point (`0, 1/steps, ..., 1`) and
one column per annotation group; each cell counts the SVs of that group whose
highest SNP LD (from `maxld`) is at or above that grid point (cumulative
from the top).
"""
function make_cumulative_table(maxld, annot, annot_groups, steps)
    println("make_cumulative_table")
    ldsum = DataFrame( ld = (0:steps)/steps )
    for group in annot_groups
        # BUG FIX: `rep` is not a Julia function (an R-ism dropped from
        # DataFrames); use Base's `fill` to make one zero counter per LD bin.
        ldsum[group] = fill(0, steps + 1)
    end
    # Histogram pass: count each SV once per annotation group in the bin of
    # its maximum LD value.
    for (sv, ld) in maxld
        row = 1 + Integer(round(steps * ld))
        for group in annot[sv]
            ldsum[row, group] += 1
        end
    end
    # Down-propagate counts (from high LD to low) to make each column cumulative.
    # Iterating in reverse means ldsum[ld_idx+1, group] is already cumulative.
    for group in annot_groups
        for ld_idx in reverse(1:steps)
            ldsum[ld_idx, group] = first(ldsum[ld_idx, group]) + first(ldsum[ld_idx + 1, group])
        end
    end
    return ldsum
end
"""
    cumulative_ld_table(ldfile, snps, annot, annot_groups, ldmeasure::Symbol, steps)

Compute the per-SV maximum LD from `ldfile` and turn it into a cumulative
count table (see `make_cumulative_table`). Both phases are wrapped in `@time`
so their costs are reported separately.
"""
function cumulative_ld_table(ldfile, snps, annot, annot_groups, ldmeasure::Symbol, steps)
    maxld = @time max_sv_ld(ldfile, snps, annot, annot_groups, ldmeasure, steps)
    return @time make_cumulative_table(maxld, annot, annot_groups, steps)
end
# CLI entry point. ARGS: [1] LD input file, [2] LD measure name ("r2" or
# "dprime"), [3] output table path. Uses 100 discretization steps.
writetable(ARGS[3], cumulative_ld_table(ARGS[1],gwas_snps,annotations, annotation_groups,Symbol(ARGS[2]), 100))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment