cth · August 20, 2015 08:35
diff --git a/Rakefile b/Rakefile
 require 'rinruby'
 require 'bio-samtools'
 require 'rake'

 plink_stems = {
 	:lucamp => "/eva/brutus/J8/data/deepExom/burden5/clean/lrt24qibin7R2BeagleV2",
 #	:metabochip => "/eva/brutus/J8/data/metaboChip/clean/metaboChipAll",
 	:exomchip => "/eva/brutus/J8/data/exomChip/clean/exomChipAll"
 }

 class String
 	def chr_i 
 	
 		i=self.gsub("chr","")
 		if i=="X" then
 			23
 		elsif i.to_i > 0 
 			i.to_i
 		else 
 			1000	
 		end	
 	end
 end

 def vcffile(mapper,particid)
 	return nil if mapper[particid].nil?
 	file="/eva/data/lpj264_temporary/gvcf_controls/" + mapper[particid] + ".g.vcf"

 	if File.exists?(file)
 		file
 	else
 		nil
 	end
 end

 def vcf_binary_search(file, chr, pos, proximity=1000)
 	gvcf=File.open(file)
 	gvcf.seek(0, IO::SEEK_END)
 	gvcf_size=gvcf.pos

 	index=gvcf_size / 2
 	change = index / 2
 	previous_index = 0

 	geno = nil 

 	loop do
 		#throw "error" if previous_index == index
 		previous_index = index

 		gvcf.seek(index, IO::SEEK_SET)
 		# We might be in the middle of a line skip to start of next line
 		gvcf.gets
 		line = gvcf.gets
 		vcfline=Bio::DB::Vcf.new(line)

 		if previous_index == index or (chr == vcfline.chrom and (pos - vcfline.pos).abs < proximity)
 			# seek backwards if we went slightly to long
 			gvcf.seek(index-20000,IO::SEEK_SET) if pos - vcfline.pos < 0
 			gvcf.readline
 			loop do 
 				line = gvcf.readline
 				vcfline=Bio::DB::Vcf.new(line)
 				if vcfline.pos = pos or (vcfline.pos > pos and !vcfline.info["END"].nil? and vcfline.info["END"].to_i <= pos) 
 					if  vcfline.samples["1"]["GT"] == "0/0"
 						geno = 0
 					elsif vcfline.samples["1"]["GT"] == "0/1" or vcfline.samples["1"]["GT"] == "0|1"	
 						geno = 1
 						puts vcfline.inspect
 					elsif vcfline.samples["1"]["GT"] == "1/1" or vcfline.samples["1"]["GT"] == "1|1"
 						geno = 1
 						puts vcfline.inspect
 					end	
 					break
 				end
 			end
 			break
 		elsif chr.chr_i < vcfline.chrom.chr_i then
 			index = index - change
 		elsif chr.chr_i > vcfline.chrom.chr_i then
 			index = index + change
 		elsif pos < vcfline.pos then 
 			index = index - change
 		elsif pos > vcfline.pos then
 			index = index + change
 		end
 		change = change / 2
 	end
 	gvcf.close

 	return geno
 end



 ###############
 # Rake stuff



 task :extract_snps_by_position => %W[ data/snp_ranges.txt data/lucamp_snps_all.ped data/exomchip_snps_all.ped ]

 task :extract_individuals => [ :extract_snps_by_position ] + %W[ data/lucamp_snps_all_inter99.ped data/exomchip_snps_all_inter99.ped ]

 task :rename_snps => [ :extract_individuals ] + %W[ data/exomchip_snps_all_inter99_renamed.ped data/exomchip_snps_all_inter99_renamed.map ]

 # Rule to extract snps by positions 
 plink_stems.each do |stemkey,stem|
 	plink_file = stem + ".bed"
 	new_plink_stem="data/#{stemkey}_snps_all"
 	new_plink_file=new_plink_stem + ".ped"
 	file new_plink_file => plink_file do
 		sh "plink19 --bfile #{stem} --extract data/snp_ranges.txt --range --recode --out #{new_plink_stem}"
 	end

 	old_plink_stem=new_plink_stem
 	new_plink_stem="data/#{stemkey}_snps_all_inter99"
 	new_plink_file=new_plink_stem + ".ped"
 	file new_plink_file => plink_file do
 		sh "plink19 --file #{old_plink_stem} --keep data/keeplist.txt --recode --out #{new_plink_stem}"
 	end
 end

 # Make ranges from position file
 file "data/snp_ranges.txt"  => "position_list.txt" do 
 	sh "cat position_list.txt|tr ':' ' '|awk '{print $1,$2,$2,\"id\"}' > data/snp_ranges.txt"
 end

 file "data/keeplist.txt" => "I99Deltnr.txt" do
 	sh "tail -n+3 I99Deltnr.txt |grep \"^1000\" |awk '{print $1, \"1\" }' > data/keeplist.txt"
 end

 file "data/exomchip_snps_all_inter99_renamed.ped" => "data/exomchip_snps_all_inter99.ped" do 
 	sh "cp data/exomchip_snps_all_inter99.ped data/exomchip_snps_all_inter99_renamed.ped"
 end

 file "data/exomchip_snps_all_inter99_renamed.map" => "data/exomchip_snps_all_inter99.map" do 
 	sh "cat data/exomchip_snps_all_inter99.map|awk '{print $1,$1 \"_\" $4, $3, $4 }' > data/exomchip_snps_all_inter99_renamed.map"
 end

 file "data/merged.ped" => [  :rename_snps ] do
 	sh "plink --file data/exomchip_snps_all_inter99_renamed --merge data/lucamp_snps_all_inter99.ped data/lucamp_snps_all_inter99.map --recode --out data/merged"
 end

 file "data/merged_012.ped" => "data/merged.ped" do
 	sh "plink19 --file data/merged --recode 12 tab --out data/merged_012"
 end 

 file "data/snps_table.tsv" => "data/merged_012.ped" do
 	genotypes = {}
 	snps = []
 	File.open("data/merged_012.map") do |file|
 		file.each do |line|
 			snps << line.split(/\s/)[1].gsub(/(.+)_(.*)/, "chr\\1:\\2")
 		end
 	end

 	File.open("data/merged_012.ped") do |file|
 		file.each do |line|
 			fields = line.split("\t")
 			genotypes[fields[0]] = []
 			6.upto(fields.size-1) do |i|
 				genotypes[fields[0]] << case fields[i]
 				when "2 2"
 					 0
 				when "1 2"
 					 1 
 				when "1 1"
 					2 
 				else
 					"NA"	
 				end	
 			end
 		end
 	end

 	# Write genotype table to file
 	File.open("data/snps_table.tsv", "w") do |file|
 		file.puts "particid\t" + snps.join("\t")
 		genotypes.each do |k,v|
 			file.puts k + "\t" + v.join("\t")
 		end
 	end
 end


 file "data/indels_table.tsv" => [ "data/lucamp_snps_all_inter99.ped", "SampleID_LibraryID.tsv", "DCM_indel_rare.csv" ]  do
 	positions = {}
 	dcm_indel_map = []

 	File.open("DCM_indel_rare.csv") do |file|
 		lineno = 0
 		file.each do |line|
 			lineno = lineno + 1
 			next unless lineno > 1
 			chr, pos = *line.split(",")[0].split(":")
 			if positions[chr].nil? then
 			chr = "chr" + chr
 			#chr = "chr" + chr.gsub("X","23")
 				positions[chr] = [ pos.to_i ]
 			else
 				positions[chr] =  ([ pos.to_i ] + positions[chr]).sort
 			end
 			
 			dcm_indel_map << [ chr, pos.to_i ]
 		end
 	end

 	particid_studyid = {}
 	File.open("SampleID_LibraryID.tsv") do |file|
 		lineno = 0
 		file.each do |line|
 			lineno = lineno + 1
 			next unless lineno > 1
 			fields = line.split(" ")
 			particid_studyid[fields[0]] = fields[1]
 		end
 	end

 	particids = [] 
 	File.open("data/lucamp_snps_all_inter99.ped") do |file|
 		file.each do |line|
 			fields = line.split(" ")
 			particids << fields[0]	
 		end

 	end

 	# Create arary to hold all genotypes
 	genotypes = {} 
 	particids.each do |p|
 		genotypes[p] = []
 		dcm_indel_map.size.times { genotypes[p] << "NA" }
 	end

 	File.open("vcf_files.txt") do |vcflist|
 		particids.each do |p|
 			vcf = vcffile(particid_studyid,p)
 			next if vcf.nil?
 			dcm_indel_map.each_with_index do |chrpos,idx|
 				chr,pos = chrpos 
 				geno=vcf_binary_search(vcf,chr,pos)
 				genotypes[p][idx] = geno unless geno.nil?
 			end
 		end
 	end

 	# Write genotype table to file
 	File.open("data/indels_table.tsv", "w") do |file|
 		file.puts "particid\t" + (dcm_indel_map.collect { |indel| indel.first + ":" + indel.last.to_s }).join("\t")
 		genotypes.each do |k,v|
 			file.puts k + "\t" + v.join("\t")
 		end
 	end
 end

 file "data/merged_table.tsv" => [ "data/snps_table.tsv", "data/indels_table.tsv" ] do
 R.eval <<EOF
 snps=read.table("data/snps_table.tsv", h=T, sep="\t")
 indels=read.table("data/indels_table.tsv", h=T, sep="\t")
 write.table(merge(snps,indels,all=T),"data/merged_table.tsv", row.names=F, col.names=T, quot=F, sep="\\t")
 EOF
 sh "sed -i 's/\\./:/g' data/merged_table.tsv"
 end
	require 'rinruby'
	require 'bio-samtools'
	require 'rake'

	plink_stems = {
	:lucamp => "/eva/brutus/J8/data/deepExom/burden5/clean/lrt24qibin7R2BeagleV2",
	# :metabochip => "/eva/brutus/J8/data/metaboChip/clean/metaboChipAll",
	:exomchip => "/eva/brutus/J8/data/exomChip/clean/exomChipAll"
	}

	class String
	def chr_i

	i=self.gsub("chr","")
	if i=="X" then
	23
	elsif i.to_i > 0
	i.to_i
	else
	1000
	end
	end
	end

	def vcffile(mapper,particid)
	return nil if mapper[particid].nil?
	file="/eva/data/lpj264_temporary/gvcf_controls/" + mapper[particid] + ".g.vcf"

	if File.exists?(file)
	file
	else
	nil
	end
	end

	def vcf_binary_search(file, chr, pos, proximity=1000)
	gvcf=File.open(file)
	gvcf.seek(0, IO::SEEK_END)
	gvcf_size=gvcf.pos

	index=gvcf_size / 2
	change = index / 2
	previous_index = 0

	geno = nil

	loop do
	#throw "error" if previous_index == index
	previous_index = index

	gvcf.seek(index, IO::SEEK_SET)
	# We might be in the middle of a line skip to start of next line
	gvcf.gets
	line = gvcf.gets
	vcfline=Bio::DB::Vcf.new(line)

	if previous_index == index or (chr == vcfline.chrom and (pos - vcfline.pos).abs < proximity)
	# seek backwards if we went slightly to long
	gvcf.seek(index-20000,IO::SEEK_SET) if pos - vcfline.pos < 0
	gvcf.readline
	loop do
	line = gvcf.readline
	vcfline=Bio::DB::Vcf.new(line)
	if vcfline.pos = pos or (vcfline.pos > pos and !vcfline.info["END"].nil? and vcfline.info["END"].to_i <= pos)
	if vcfline.samples["1"]["GT"] == "0/0"
	geno = 0
	elsif vcfline.samples["1"]["GT"] == "0/1" or vcfline.samples["1"]["GT"] == "0\|1"
	geno = 1
	puts vcfline.inspect
	elsif vcfline.samples["1"]["GT"] == "1/1" or vcfline.samples["1"]["GT"] == "1\|1"
	geno = 1
	puts vcfline.inspect
	end
	break
	end
	end
	break
	elsif chr.chr_i < vcfline.chrom.chr_i then
	index = index - change
	elsif chr.chr_i > vcfline.chrom.chr_i then
	index = index + change
	elsif pos < vcfline.pos then
	index = index - change
	elsif pos > vcfline.pos then
	index = index + change
	end
	change = change / 2
	end
	gvcf.close

	return geno
	end



	###############
	# Rake stuff



	task :extract_snps_by_position => %W[ data/snp_ranges.txt data/lucamp_snps_all.ped data/exomchip_snps_all.ped ]

	task :extract_individuals => [ :extract_snps_by_position ] + %W[ data/lucamp_snps_all_inter99.ped data/exomchip_snps_all_inter99.ped ]

	task :rename_snps => [ :extract_individuals ] + %W[ data/exomchip_snps_all_inter99_renamed.ped data/exomchip_snps_all_inter99_renamed.map ]

	# Rule to extract snps by positions
	plink_stems.each do \|stemkey,stem\|
	plink_file = stem + ".bed"
	new_plink_stem="data/#{stemkey}_snps_all"
	new_plink_file=new_plink_stem + ".ped"
	file new_plink_file => plink_file do
	sh "plink19 --bfile #{stem} --extract data/snp_ranges.txt --range --recode --out #{new_plink_stem}"
	end

	old_plink_stem=new_plink_stem
	new_plink_stem="data/#{stemkey}_snps_all_inter99"
	new_plink_file=new_plink_stem + ".ped"
	file new_plink_file => plink_file do
	sh "plink19 --file #{old_plink_stem} --keep data/keeplist.txt --recode --out #{new_plink_stem}"
	end
	end

	# Make ranges from position file
	file "data/snp_ranges.txt" => "position_list.txt" do
	sh "cat position_list.txt\|tr ':' ' '\|awk '{print $1,$2,$2,\"id\"}' > data/snp_ranges.txt"
	end

	file "data/keeplist.txt" => "I99Deltnr.txt" do
	sh "tail -n+3 I99Deltnr.txt \|grep \"^1000\" \|awk '{print $1, \"1\" }' > data/keeplist.txt"
	end

	file "data/exomchip_snps_all_inter99_renamed.ped" => "data/exomchip_snps_all_inter99.ped" do
	sh "cp data/exomchip_snps_all_inter99.ped data/exomchip_snps_all_inter99_renamed.ped"
	end

	file "data/exomchip_snps_all_inter99_renamed.map" => "data/exomchip_snps_all_inter99.map" do
	sh "cat data/exomchip_snps_all_inter99.map\|awk '{print $1,$1 \"_\" $4, $3, $4 }' > data/exomchip_snps_all_inter99_renamed.map"
	end

	file "data/merged.ped" => [ :rename_snps ] do
	sh "plink --file data/exomchip_snps_all_inter99_renamed --merge data/lucamp_snps_all_inter99.ped data/lucamp_snps_all_inter99.map --recode --out data/merged"
	end

	file "data/merged_012.ped" => "data/merged.ped" do
	sh "plink19 --file data/merged --recode 12 tab --out data/merged_012"
	end

	file "data/snps_table.tsv" => "data/merged_012.ped" do
	genotypes = {}
	snps = []
	File.open("data/merged_012.map") do \|file\|
	file.each do \|line\|
	snps << line.split(/\s/)[1].gsub(/(.+)_(.*)/, "chr\\1:\\2")
	end
	end

	File.open("data/merged_012.ped") do \|file\|
	file.each do \|line\|
	fields = line.split("\t")
	genotypes[fields[0]] = []
	6.upto(fields.size-1) do \|i\|
	genotypes[fields[0]] << case fields[i]
	when "2 2"
	0
	when "1 2"
	1
	when "1 1"
	2
	else
	"NA"
	end
	end
	end
	end

	# Write genotype table to file
	File.open("data/snps_table.tsv", "w") do \|file\|
	file.puts "particid\t" + snps.join("\t")
	genotypes.each do \|k,v\|
	file.puts k + "\t" + v.join("\t")
	end
	end
	end


	file "data/indels_table.tsv" => [ "data/lucamp_snps_all_inter99.ped", "SampleID_LibraryID.tsv", "DCM_indel_rare.csv" ] do
	positions = {}
	dcm_indel_map = []

	File.open("DCM_indel_rare.csv") do \|file\|
	lineno = 0
	file.each do \|line\|
	lineno = lineno + 1
	next unless lineno > 1
	chr, pos = *line.split(",")[0].split(":")
	if positions[chr].nil? then
	chr = "chr" + chr
	#chr = "chr" + chr.gsub("X","23")
	positions[chr] = [ pos.to_i ]
	else
	positions[chr] = ([ pos.to_i ] + positions[chr]).sort
	end

	dcm_indel_map << [ chr, pos.to_i ]
	end
	end

	particid_studyid = {}
	File.open("SampleID_LibraryID.tsv") do \|file\|
	lineno = 0
	file.each do \|line\|
	lineno = lineno + 1
	next unless lineno > 1
	fields = line.split(" ")
	particid_studyid[fields[0]] = fields[1]
	end
	end

	particids = []
	File.open("data/lucamp_snps_all_inter99.ped") do \|file\|
	file.each do \|line\|
	fields = line.split(" ")
	particids << fields[0]
	end

	end

	# Create arary to hold all genotypes
	genotypes = {}
	particids.each do \|p\|
	genotypes[p] = []
	dcm_indel_map.size.times { genotypes[p] << "NA" }
	end

	File.open("vcf_files.txt") do \|vcflist\|
	particids.each do \|p\|
	vcf = vcffile(particid_studyid,p)
	next if vcf.nil?
	dcm_indel_map.each_with_index do \|chrpos,idx\|
	chr,pos = chrpos
	geno=vcf_binary_search(vcf,chr,pos)
	genotypes[p][idx] = geno unless geno.nil?
	end
	end
	end

	# Write genotype table to file
	File.open("data/indels_table.tsv", "w") do \|file\|
	file.puts "particid\t" + (dcm_indel_map.collect { \|indel\| indel.first + ":" + indel.last.to_s }).join("\t")
	genotypes.each do \|k,v\|
	file.puts k + "\t" + v.join("\t")
	end
	end
	end

	file "data/merged_table.tsv" => [ "data/snps_table.tsv", "data/indels_table.tsv" ] do
	R.eval <<EOF
	snps=read.table("data/snps_table.tsv", h=T, sep="\t")
	indels=read.table("data/indels_table.tsv", h=T, sep="\t")
	write.table(merge(snps,indels,all=T),"data/merged_table.tsv", row.names=F, col.names=T, quot=F, sep="\\t")
	EOF
	sh "sed -i 's/\\./:/g' data/merged_table.tsv"
	end