Skip to content

Instantly share code, notes, and snippets.

@cth
Created August 20, 2015 08:35
Show Gist options
  • Save cth/d6c93a188c612b31750f to your computer and use it in GitHub Desktop.
Save cth/d6c93a188c612b31750f to your computer and use it in GitHub Desktop.
require 'rinruby'
require 'bio-samtools'
require 'rake'
plink_stems = {
:lucamp => "/eva/brutus/J8/data/deepExom/burden5/clean/lrt24qibin7R2BeagleV2",
# :metabochip => "/eva/brutus/J8/data/metaboChip/clean/metaboChipAll",
:exomchip => "/eva/brutus/J8/data/exomChip/clean/exomChipAll"
}
class String
def chr_i
i=self.gsub("chr","")
if i=="X" then
23
elsif i.to_i > 0
i.to_i
else
1000
end
end
end
def vcffile(mapper,particid)
return nil if mapper[particid].nil?
file="/eva/data/lpj264_temporary/gvcf_controls/" + mapper[particid] + ".g.vcf"
if File.exists?(file)
file
else
nil
end
end
def vcf_binary_search(file, chr, pos, proximity=1000)
gvcf=File.open(file)
gvcf.seek(0, IO::SEEK_END)
gvcf_size=gvcf.pos
index=gvcf_size / 2
change = index / 2
previous_index = 0
geno = nil
loop do
#throw "error" if previous_index == index
previous_index = index
gvcf.seek(index, IO::SEEK_SET)
# We might be in the middle of a line skip to start of next line
gvcf.gets
line = gvcf.gets
vcfline=Bio::DB::Vcf.new(line)
if previous_index == index or (chr == vcfline.chrom and (pos - vcfline.pos).abs < proximity)
# seek backwards if we went slightly to long
gvcf.seek(index-20000,IO::SEEK_SET) if pos - vcfline.pos < 0
gvcf.readline
loop do
line = gvcf.readline
vcfline=Bio::DB::Vcf.new(line)
if vcfline.pos = pos or (vcfline.pos > pos and !vcfline.info["END"].nil? and vcfline.info["END"].to_i <= pos)
if vcfline.samples["1"]["GT"] == "0/0"
geno = 0
elsif vcfline.samples["1"]["GT"] == "0/1" or vcfline.samples["1"]["GT"] == "0|1"
geno = 1
puts vcfline.inspect
elsif vcfline.samples["1"]["GT"] == "1/1" or vcfline.samples["1"]["GT"] == "1|1"
geno = 1
puts vcfline.inspect
end
break
end
end
break
elsif chr.chr_i < vcfline.chrom.chr_i then
index = index - change
elsif chr.chr_i > vcfline.chrom.chr_i then
index = index + change
elsif pos < vcfline.pos then
index = index - change
elsif pos > vcfline.pos then
index = index + change
end
change = change / 2
end
gvcf.close
return geno
end
###############
# Rake stuff
task :extract_snps_by_position => %W[ data/snp_ranges.txt data/lucamp_snps_all.ped data/exomchip_snps_all.ped ]
task :extract_individuals => [ :extract_snps_by_position ] + %W[ data/lucamp_snps_all_inter99.ped data/exomchip_snps_all_inter99.ped ]
task :rename_snps => [ :extract_individuals ] + %W[ data/exomchip_snps_all_inter99_renamed.ped data/exomchip_snps_all_inter99_renamed.map ]
# Rule to extract snps by positions
plink_stems.each do |stemkey,stem|
plink_file = stem + ".bed"
new_plink_stem="data/#{stemkey}_snps_all"
new_plink_file=new_plink_stem + ".ped"
file new_plink_file => plink_file do
sh "plink19 --bfile #{stem} --extract data/snp_ranges.txt --range --recode --out #{new_plink_stem}"
end
old_plink_stem=new_plink_stem
new_plink_stem="data/#{stemkey}_snps_all_inter99"
new_plink_file=new_plink_stem + ".ped"
file new_plink_file => plink_file do
sh "plink19 --file #{old_plink_stem} --keep data/keeplist.txt --recode --out #{new_plink_stem}"
end
end
# Make ranges from position file
file "data/snp_ranges.txt" => "position_list.txt" do
sh "cat position_list.txt|tr ':' ' '|awk '{print $1,$2,$2,\"id\"}' > data/snp_ranges.txt"
end
file "data/keeplist.txt" => "I99Deltnr.txt" do
sh "tail -n+3 I99Deltnr.txt |grep \"^1000\" |awk '{print $1, \"1\" }' > data/keeplist.txt"
end
file "data/exomchip_snps_all_inter99_renamed.ped" => "data/exomchip_snps_all_inter99.ped" do
sh "cp data/exomchip_snps_all_inter99.ped data/exomchip_snps_all_inter99_renamed.ped"
end
file "data/exomchip_snps_all_inter99_renamed.map" => "data/exomchip_snps_all_inter99.map" do
sh "cat data/exomchip_snps_all_inter99.map|awk '{print $1,$1 \"_\" $4, $3, $4 }' > data/exomchip_snps_all_inter99_renamed.map"
end
file "data/merged.ped" => [ :rename_snps ] do
sh "plink --file data/exomchip_snps_all_inter99_renamed --merge data/lucamp_snps_all_inter99.ped data/lucamp_snps_all_inter99.map --recode --out data/merged"
end
file "data/merged_012.ped" => "data/merged.ped" do
sh "plink19 --file data/merged --recode 12 tab --out data/merged_012"
end
file "data/snps_table.tsv" => "data/merged_012.ped" do
genotypes = {}
snps = []
File.open("data/merged_012.map") do |file|
file.each do |line|
snps << line.split(/\s/)[1].gsub(/(.+)_(.*)/, "chr\\1:\\2")
end
end
File.open("data/merged_012.ped") do |file|
file.each do |line|
fields = line.split("\t")
genotypes[fields[0]] = []
6.upto(fields.size-1) do |i|
genotypes[fields[0]] << case fields[i]
when "2 2"
0
when "1 2"
1
when "1 1"
2
else
"NA"
end
end
end
end
# Write genotype table to file
File.open("data/snps_table.tsv", "w") do |file|
file.puts "particid\t" + snps.join("\t")
genotypes.each do |k,v|
file.puts k + "\t" + v.join("\t")
end
end
end
file "data/indels_table.tsv" => [ "data/lucamp_snps_all_inter99.ped", "SampleID_LibraryID.tsv", "DCM_indel_rare.csv" ] do
positions = {}
dcm_indel_map = []
File.open("DCM_indel_rare.csv") do |file|
lineno = 0
file.each do |line|
lineno = lineno + 1
next unless lineno > 1
chr, pos = *line.split(",")[0].split(":")
if positions[chr].nil? then
chr = "chr" + chr
#chr = "chr" + chr.gsub("X","23")
positions[chr] = [ pos.to_i ]
else
positions[chr] = ([ pos.to_i ] + positions[chr]).sort
end
dcm_indel_map << [ chr, pos.to_i ]
end
end
particid_studyid = {}
File.open("SampleID_LibraryID.tsv") do |file|
lineno = 0
file.each do |line|
lineno = lineno + 1
next unless lineno > 1
fields = line.split(" ")
particid_studyid[fields[0]] = fields[1]
end
end
particids = []
File.open("data/lucamp_snps_all_inter99.ped") do |file|
file.each do |line|
fields = line.split(" ")
particids << fields[0]
end
end
# Create arary to hold all genotypes
genotypes = {}
particids.each do |p|
genotypes[p] = []
dcm_indel_map.size.times { genotypes[p] << "NA" }
end
File.open("vcf_files.txt") do |vcflist|
particids.each do |p|
vcf = vcffile(particid_studyid,p)
next if vcf.nil?
dcm_indel_map.each_with_index do |chrpos,idx|
chr,pos = chrpos
geno=vcf_binary_search(vcf,chr,pos)
genotypes[p][idx] = geno unless geno.nil?
end
end
end
# Write genotype table to file
File.open("data/indels_table.tsv", "w") do |file|
file.puts "particid\t" + (dcm_indel_map.collect { |indel| indel.first + ":" + indel.last.to_s }).join("\t")
genotypes.each do |k,v|
file.puts k + "\t" + v.join("\t")
end
end
end
file "data/merged_table.tsv" => [ "data/snps_table.tsv", "data/indels_table.tsv" ] do
R.eval <<EOF
snps=read.table("data/snps_table.tsv", h=T, sep="\t")
indels=read.table("data/indels_table.tsv", h=T, sep="\t")
write.table(merge(snps,indels,all=T),"data/merged_table.tsv", row.names=F, col.names=T, quot=F, sep="\\t")
EOF
sh "sed -i 's/\\./:/g' data/merged_table.tsv"
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment