Skip to content

Instantly share code, notes, and snippets.

@cth
Last active October 26, 2015 10:00
Show Gist options
  • Save cth/9f7b5ce47a6da8c8d8b9 to your computer and use it in GitHub Desktop.
Save cth/9f7b5ce47a6da8c8d8b9 to your computer and use it in GitHub Desktop.
For imputed genotypes used the batch of 2015-04-27 which include DCH an other smaller cohorts.
This data is imputed using impute2 version 2.3.0 and phased using shapeit v2.r727.
Variants with p(HWE) < 0.000001 or callrate less than 95%, MAF less than %1 were removed prior
to impution. The panel used was 1000 Genomes phase 1.
The dosages all 48 SNPs were extracted from the VCF created from the imputed data
and merged with the 13 directly genotyped SNPs. Whenever, directly genotyped variants
are available they are used in place of the imputed dosages.
The number of inconsistencies between genotyped SNP and rounded imputed dosage are quite few,
ranging from 0 to 7.
The final data set contains 3579 individuals represented in a tab separated file where
each column correspond to a particular SNP. Dosages/genotypes are always reported for
the reference allele on plus strand of GRC37, i.e., 0 means homozygote reference allele.
ln -s /eva/users/fng514/imputation/batch-2015-04-27/VCFs
mkdir -p extracted/
tail -n+2 summary_whr_snps.txt|awk '{print $2}'|tr -d '"' > rsnumbers.txt
tail -n+2 summary_whr_snps.txt|awk '{print $10,$11,$2}' |tr -d '"' > chr_pos_rs.txt
echo "6 32381736 rs7759742" >> chr_pos_rs.txt # position obtained from DBSNP
#grep -w -f rsnumbers.txt /eva/data/clean/DCH_Unik/chip/DCH_Unik_PLUS_b37_final.bim|awk '{print $2}' > on_chip_snps.txt
plink19 --bfile /eva/data/clean/DCH_Unik/chip/DCH_Unik_PLUS_b37_final --recode A tab --extract on_chip_snps.txt --out on_chip
python vcf_extractor.py
oneVCF=`ls extracted/*vcf|tr "\n" " "| cut -d" " -f1 `
# header lines from one VCF file
cat $oneVCF | grep "^#.*$" | sort > merged.vcf
cat extracted/*vcf | grep -v "^#.*" |sort -n|uniq >> merged.vcf
cat on_chip.ped|cut -f1 > keep.txt
vcftools --vcf merged.vcf --keep keep.txt --recode --out 42x
cat merged.vcf|awk '{print $3,$8}'|cut -d";" -f1|tr '=' ' '|sed 's/exm-//g' > INFO_scores.txt
ruby merge_chip_impute.rb
require 'bio-samtools'
individuals = []
dosages = {}
snps = []
ref_alleles = {}
File.open("42x.recode.vcf") do |vcf|
vcf.each do |line|
if line =~ /CHROM/ then
fields = line.chomp.split("\t")
9.upto(fields.size-1) do |i|
individuals << fields[i]
end
end
vcfline=Bio::DB::Vcf.new(line)
next if vcfline.samples.size == 0
rs = vcfline.id.gsub("exm-", "")
ref_alleles[rs] = vcfline.ref
snps << rs
1.upto(individuals.size) do |i|
if dosages[rs].nil? then
dosages[rs] = [ vcfline.samples[i.to_s]["DS"].to_f ]
else
dosages[rs] << vcfline.samples[i.to_s]["DS"].to_f
end
end
end
end
inconsistent = {}
map_rs = []
alleles = []
File.open("on_chip.raw") do |rawfile|
rawfile.each do |line|
fields = line.chomp.split("\t")
puts fields[0]
if fields[0] == "FID" then
puts "here"
6.times { fields.shift }
map_rs = fields.collect { |x| x.gsub("exm-", "").gsub(/(.*)_(.)/,"\\1") }
alleles = fields.collect { |x| x.gsub("exm-", "").gsub(/(.*)_(.)/,"\\2") }
map_rs.each { |a| inconsistent[a] = 0 }
puts "maprs: " + map_rs.inspect
puts "alleles: " + alleles.inspect
next
end
indv = fields.shift
5.times { fields.shift }
s = 0
throw "bad indv #{indv}" unless individuals.include?(indv)
0.upto(fields.size-1) do |s|
rs = map_rs[s]
alelle = alleles[s]
geno = fields[s]
throw "bad snp" unless dosages.keys.include?(rs)
old_dosage = dosages[rs][individuals.rindex(indv)]
if alelle != ref_alleles[rs]
dosages[rs][individuals.rindex(indv)] = geno.to_f
else
dosages[rs][individuals.rindex(indv)] = 2-geno.to_f
end
if old_dosage.round != dosages[rs][individuals.rindex(indv)]
inconsistent[rs] = inconsistent[rs] + 1
end
end
end
end
puts inconsistent.inspect
File.open("merged_impute_chip.csv", "w") do |outfile|
# Write output as csv file
outfile.puts (["particid"] + (ref_alleles.collect {|x| x.join("_") })).join("\t")
0.upto(individuals.size-1) do |i|
outfile.puts ([individuals[i]] + snps.collect { |s| dosages[s][i] }).join("\t")
end
end
#!/usr/bin/python
#$ -S /usr/bin/python
#$ -cwd
import os
import re
extractList = open('chr_pos_rs.txt', 'r')
vcfs = os.listdir('VCFs/')
#commandStart = 'plink19 --vcf VCFs/'
commandStart = "/home/fng514/bin/vcftools --gzvcf VCFs/"
for line in extractList:
splitLine = line.split()
lineChr = int(splitLine[0])
linePos = int(splitLine[1])
lineRS = splitLine[2]
print(lineRS)
for j in vcfs:
if(re.search('.tbi',j)==None):
continue
j = re.sub('.tbi','',j)
i = j.split('.')
chr=int(i[2])
batch = i[3].split('-')
startPos=int(batch[0])
endPos=int(batch[1])
if(chr==lineChr):
if(linePos < endPos and linePos > startPos):
print(j)
print("extracted/"+lineRS+".recode.vcf")
if os.path.isfile('extracted/'+lineRS+'.log'):
print('log file exists, checking extraction was successful')
if(re.search("kept 1", open('extracted/'+lineRS+'.log').read())!=None):
print("it was :-)")
continue
else:
print("it was NOT")
os.system(commandStart + j + ' --chr ' + str(chr) + ' --from-bp ' + str(linePos-1) + ' --to-bp ' + str(linePos+1) + ' --recode --recode-INFO-all --out extracted/' + lineRS)
else:
os.system(commandStart + j + ' --snp ' + lineRS + ' --recode --recode-INFO-all --out extracted/' + lineRS)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment