Created
August 20, 2015 08:35
-
-
Save cth/d6c93a188c612b31750f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'rinruby' | |
require 'bio-samtools' | |
require 'rake' | |
plink_stems = { | |
:lucamp => "/eva/brutus/J8/data/deepExom/burden5/clean/lrt24qibin7R2BeagleV2", | |
# :metabochip => "/eva/brutus/J8/data/metaboChip/clean/metaboChipAll", | |
:exomchip => "/eva/brutus/J8/data/exomChip/clean/exomChipAll" | |
} | |
class String | |
def chr_i | |
i=self.gsub("chr","") | |
if i=="X" then | |
23 | |
elsif i.to_i > 0 | |
i.to_i | |
else | |
1000 | |
end | |
end | |
end | |
def vcffile(mapper,particid) | |
return nil if mapper[particid].nil? | |
file="/eva/data/lpj264_temporary/gvcf_controls/" + mapper[particid] + ".g.vcf" | |
if File.exists?(file) | |
file | |
else | |
nil | |
end | |
end | |
def vcf_binary_search(file, chr, pos, proximity=1000) | |
gvcf=File.open(file) | |
gvcf.seek(0, IO::SEEK_END) | |
gvcf_size=gvcf.pos | |
index=gvcf_size / 2 | |
change = index / 2 | |
previous_index = 0 | |
geno = nil | |
loop do | |
#throw "error" if previous_index == index | |
previous_index = index | |
gvcf.seek(index, IO::SEEK_SET) | |
# We might be in the middle of a line skip to start of next line | |
gvcf.gets | |
line = gvcf.gets | |
vcfline=Bio::DB::Vcf.new(line) | |
if previous_index == index or (chr == vcfline.chrom and (pos - vcfline.pos).abs < proximity) | |
# seek backwards if we went slightly to long | |
gvcf.seek(index-20000,IO::SEEK_SET) if pos - vcfline.pos < 0 | |
gvcf.readline | |
loop do | |
line = gvcf.readline | |
vcfline=Bio::DB::Vcf.new(line) | |
if vcfline.pos = pos or (vcfline.pos > pos and !vcfline.info["END"].nil? and vcfline.info["END"].to_i <= pos) | |
if vcfline.samples["1"]["GT"] == "0/0" | |
geno = 0 | |
elsif vcfline.samples["1"]["GT"] == "0/1" or vcfline.samples["1"]["GT"] == "0|1" | |
geno = 1 | |
puts vcfline.inspect | |
elsif vcfline.samples["1"]["GT"] == "1/1" or vcfline.samples["1"]["GT"] == "1|1" | |
geno = 1 | |
puts vcfline.inspect | |
end | |
break | |
end | |
end | |
break | |
elsif chr.chr_i < vcfline.chrom.chr_i then | |
index = index - change | |
elsif chr.chr_i > vcfline.chrom.chr_i then | |
index = index + change | |
elsif pos < vcfline.pos then | |
index = index - change | |
elsif pos > vcfline.pos then | |
index = index + change | |
end | |
change = change / 2 | |
end | |
gvcf.close | |
return geno | |
end | |
############### | |
# Rake stuff | |
task :extract_snps_by_position => %W[ data/snp_ranges.txt data/lucamp_snps_all.ped data/exomchip_snps_all.ped ] | |
task :extract_individuals => [ :extract_snps_by_position ] + %W[ data/lucamp_snps_all_inter99.ped data/exomchip_snps_all_inter99.ped ] | |
task :rename_snps => [ :extract_individuals ] + %W[ data/exomchip_snps_all_inter99_renamed.ped data/exomchip_snps_all_inter99_renamed.map ] | |
# Rule to extract snps by positions | |
plink_stems.each do |stemkey,stem| | |
plink_file = stem + ".bed" | |
new_plink_stem="data/#{stemkey}_snps_all" | |
new_plink_file=new_plink_stem + ".ped" | |
file new_plink_file => plink_file do | |
sh "plink19 --bfile #{stem} --extract data/snp_ranges.txt --range --recode --out #{new_plink_stem}" | |
end | |
old_plink_stem=new_plink_stem | |
new_plink_stem="data/#{stemkey}_snps_all_inter99" | |
new_plink_file=new_plink_stem + ".ped" | |
file new_plink_file => plink_file do | |
sh "plink19 --file #{old_plink_stem} --keep data/keeplist.txt --recode --out #{new_plink_stem}" | |
end | |
end | |
# Make ranges from position file | |
file "data/snp_ranges.txt" => "position_list.txt" do | |
sh "cat position_list.txt|tr ':' ' '|awk '{print $1,$2,$2,\"id\"}' > data/snp_ranges.txt" | |
end | |
file "data/keeplist.txt" => "I99Deltnr.txt" do | |
sh "tail -n+3 I99Deltnr.txt |grep \"^1000\" |awk '{print $1, \"1\" }' > data/keeplist.txt" | |
end | |
file "data/exomchip_snps_all_inter99_renamed.ped" => "data/exomchip_snps_all_inter99.ped" do | |
sh "cp data/exomchip_snps_all_inter99.ped data/exomchip_snps_all_inter99_renamed.ped" | |
end | |
file "data/exomchip_snps_all_inter99_renamed.map" => "data/exomchip_snps_all_inter99.map" do | |
sh "cat data/exomchip_snps_all_inter99.map|awk '{print $1,$1 \"_\" $4, $3, $4 }' > data/exomchip_snps_all_inter99_renamed.map" | |
end | |
file "data/merged.ped" => [ :rename_snps ] do | |
sh "plink --file data/exomchip_snps_all_inter99_renamed --merge data/lucamp_snps_all_inter99.ped data/lucamp_snps_all_inter99.map --recode --out data/merged" | |
end | |
file "data/merged_012.ped" => "data/merged.ped" do | |
sh "plink19 --file data/merged --recode 12 tab --out data/merged_012" | |
end | |
file "data/snps_table.tsv" => "data/merged_012.ped" do | |
genotypes = {} | |
snps = [] | |
File.open("data/merged_012.map") do |file| | |
file.each do |line| | |
snps << line.split(/\s/)[1].gsub(/(.+)_(.*)/, "chr\\1:\\2") | |
end | |
end | |
File.open("data/merged_012.ped") do |file| | |
file.each do |line| | |
fields = line.split("\t") | |
genotypes[fields[0]] = [] | |
6.upto(fields.size-1) do |i| | |
genotypes[fields[0]] << case fields[i] | |
when "2 2" | |
0 | |
when "1 2" | |
1 | |
when "1 1" | |
2 | |
else | |
"NA" | |
end | |
end | |
end | |
end | |
# Write genotype table to file | |
File.open("data/snps_table.tsv", "w") do |file| | |
file.puts "particid\t" + snps.join("\t") | |
genotypes.each do |k,v| | |
file.puts k + "\t" + v.join("\t") | |
end | |
end | |
end | |
file "data/indels_table.tsv" => [ "data/lucamp_snps_all_inter99.ped", "SampleID_LibraryID.tsv", "DCM_indel_rare.csv" ] do | |
positions = {} | |
dcm_indel_map = [] | |
File.open("DCM_indel_rare.csv") do |file| | |
lineno = 0 | |
file.each do |line| | |
lineno = lineno + 1 | |
next unless lineno > 1 | |
chr, pos = *line.split(",")[0].split(":") | |
if positions[chr].nil? then | |
chr = "chr" + chr | |
#chr = "chr" + chr.gsub("X","23") | |
positions[chr] = [ pos.to_i ] | |
else | |
positions[chr] = ([ pos.to_i ] + positions[chr]).sort | |
end | |
dcm_indel_map << [ chr, pos.to_i ] | |
end | |
end | |
particid_studyid = {} | |
File.open("SampleID_LibraryID.tsv") do |file| | |
lineno = 0 | |
file.each do |line| | |
lineno = lineno + 1 | |
next unless lineno > 1 | |
fields = line.split(" ") | |
particid_studyid[fields[0]] = fields[1] | |
end | |
end | |
particids = [] | |
File.open("data/lucamp_snps_all_inter99.ped") do |file| | |
file.each do |line| | |
fields = line.split(" ") | |
particids << fields[0] | |
end | |
end | |
# Create arary to hold all genotypes | |
genotypes = {} | |
particids.each do |p| | |
genotypes[p] = [] | |
dcm_indel_map.size.times { genotypes[p] << "NA" } | |
end | |
File.open("vcf_files.txt") do |vcflist| | |
particids.each do |p| | |
vcf = vcffile(particid_studyid,p) | |
next if vcf.nil? | |
dcm_indel_map.each_with_index do |chrpos,idx| | |
chr,pos = chrpos | |
geno=vcf_binary_search(vcf,chr,pos) | |
genotypes[p][idx] = geno unless geno.nil? | |
end | |
end | |
end | |
# Write genotype table to file | |
File.open("data/indels_table.tsv", "w") do |file| | |
file.puts "particid\t" + (dcm_indel_map.collect { |indel| indel.first + ":" + indel.last.to_s }).join("\t") | |
genotypes.each do |k,v| | |
file.puts k + "\t" + v.join("\t") | |
end | |
end | |
end | |
file "data/merged_table.tsv" => [ "data/snps_table.tsv", "data/indels_table.tsv" ] do | |
R.eval <<EOF | |
snps=read.table("data/snps_table.tsv", h=T, sep="\t") | |
indels=read.table("data/indels_table.tsv", h=T, sep="\t") | |
write.table(merge(snps,indels,all=T),"data/merged_table.tsv", row.names=F, col.names=T, quot=F, sep="\\t") | |
EOF | |
sh "sed -i 's/\\./:/g' data/merged_table.tsv" | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment