Last active
October 26, 2015 10:00
-
-
Save cth/9f7b5ce47a6da8c8d8b9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
For imputed genotypes used the batch of 2015-04-27 which include DCH an other smaller cohorts. | |
This data is imputed using impute2 version 2.3.0 and phased using shapeit v2.r727. | |
Variants with p(HWE) < 0.000001 or callrate less than 95%, MAF less than %1 were removed prior | |
to impution. The panel used was 1000 Genomes phase 1. | |
The dosages all 48 SNPs were extracted from the VCF created from the imputed data | |
and merged with the 13 directly genotyped SNPs. Whenever, directly genotyped variants | |
are available they are used in place of the imputed dosages. | |
The number of inconsistencies between genotyped SNP and rounded imputed dosage are quite few, | |
ranging from 0 to 7. | |
The final data set contains 3579 individuals represented in a tab separated file where | |
each column correspond to a particular SNP. Dosages/genotypes are always reported for | |
the reference allele on plus strand of GRC37, i.e., 0 means homozygote reference allele. |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ln -s /eva/users/fng514/imputation/batch-2015-04-27/VCFs | |
mkdir -p extracted/ | |
tail -n+2 summary_whr_snps.txt|awk '{print $2}'|tr -d '"' > rsnumbers.txt | |
tail -n+2 summary_whr_snps.txt|awk '{print $10,$11,$2}' |tr -d '"' > chr_pos_rs.txt | |
echo "6 32381736 rs7759742" >> chr_pos_rs.txt # position obtained from DBSNP | |
#grep -w -f rsnumbers.txt /eva/data/clean/DCH_Unik/chip/DCH_Unik_PLUS_b37_final.bim|awk '{print $2}' > on_chip_snps.txt | |
plink19 --bfile /eva/data/clean/DCH_Unik/chip/DCH_Unik_PLUS_b37_final --recode A tab --extract on_chip_snps.txt --out on_chip | |
python vcf_extractor.py | |
oneVCF=`ls extracted/*vcf|tr "\n" " "| cut -d" " -f1 ` | |
# header lines from one VCF file | |
cat $oneVCF | grep "^#.*$" | sort > merged.vcf | |
cat extracted/*vcf | grep -v "^#.*" |sort -n|uniq >> merged.vcf | |
cat on_chip.ped|cut -f1 > keep.txt | |
vcftools --vcf merged.vcf --keep keep.txt --recode --out 42x | |
cat merged.vcf|awk '{print $3,$8}'|cut -d";" -f1|tr '=' ' '|sed 's/exm-//g' > INFO_scores.txt | |
ruby merge_chip_impute.rb |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'bio-samtools' | |
individuals = [] | |
dosages = {} | |
snps = [] | |
ref_alleles = {} | |
File.open("42x.recode.vcf") do |vcf| | |
vcf.each do |line| | |
if line =~ /CHROM/ then | |
fields = line.chomp.split("\t") | |
9.upto(fields.size-1) do |i| | |
individuals << fields[i] | |
end | |
end | |
vcfline=Bio::DB::Vcf.new(line) | |
next if vcfline.samples.size == 0 | |
rs = vcfline.id.gsub("exm-", "") | |
ref_alleles[rs] = vcfline.ref | |
snps << rs | |
1.upto(individuals.size) do |i| | |
if dosages[rs].nil? then | |
dosages[rs] = [ vcfline.samples[i.to_s]["DS"].to_f ] | |
else | |
dosages[rs] << vcfline.samples[i.to_s]["DS"].to_f | |
end | |
end | |
end | |
end | |
inconsistent = {} | |
map_rs = [] | |
alleles = [] | |
File.open("on_chip.raw") do |rawfile| | |
rawfile.each do |line| | |
fields = line.chomp.split("\t") | |
puts fields[0] | |
if fields[0] == "FID" then | |
puts "here" | |
6.times { fields.shift } | |
map_rs = fields.collect { |x| x.gsub("exm-", "").gsub(/(.*)_(.)/,"\\1") } | |
alleles = fields.collect { |x| x.gsub("exm-", "").gsub(/(.*)_(.)/,"\\2") } | |
map_rs.each { |a| inconsistent[a] = 0 } | |
puts "maprs: " + map_rs.inspect | |
puts "alleles: " + alleles.inspect | |
next | |
end | |
indv = fields.shift | |
5.times { fields.shift } | |
s = 0 | |
throw "bad indv #{indv}" unless individuals.include?(indv) | |
0.upto(fields.size-1) do |s| | |
rs = map_rs[s] | |
alelle = alleles[s] | |
geno = fields[s] | |
throw "bad snp" unless dosages.keys.include?(rs) | |
old_dosage = dosages[rs][individuals.rindex(indv)] | |
if alelle != ref_alleles[rs] | |
dosages[rs][individuals.rindex(indv)] = geno.to_f | |
else | |
dosages[rs][individuals.rindex(indv)] = 2-geno.to_f | |
end | |
if old_dosage.round != dosages[rs][individuals.rindex(indv)] | |
inconsistent[rs] = inconsistent[rs] + 1 | |
end | |
end | |
end | |
end | |
puts inconsistent.inspect | |
File.open("merged_impute_chip.csv", "w") do |outfile| | |
# Write output as csv file | |
outfile.puts (["particid"] + (ref_alleles.collect {|x| x.join("_") })).join("\t") | |
0.upto(individuals.size-1) do |i| | |
outfile.puts ([individuals[i]] + snps.collect { |s| dosages[s][i] }).join("\t") | |
end | |
end |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
#$ -S /usr/bin/python | |
#$ -cwd | |
import os | |
import re | |
extractList = open('chr_pos_rs.txt', 'r') | |
vcfs = os.listdir('VCFs/') | |
#commandStart = 'plink19 --vcf VCFs/' | |
commandStart = "/home/fng514/bin/vcftools --gzvcf VCFs/" | |
for line in extractList: | |
splitLine = line.split() | |
lineChr = int(splitLine[0]) | |
linePos = int(splitLine[1]) | |
lineRS = splitLine[2] | |
print(lineRS) | |
for j in vcfs: | |
if(re.search('.tbi',j)==None): | |
continue | |
j = re.sub('.tbi','',j) | |
i = j.split('.') | |
chr=int(i[2]) | |
batch = i[3].split('-') | |
startPos=int(batch[0]) | |
endPos=int(batch[1]) | |
if(chr==lineChr): | |
if(linePos < endPos and linePos > startPos): | |
print(j) | |
print("extracted/"+lineRS+".recode.vcf") | |
if os.path.isfile('extracted/'+lineRS+'.log'): | |
print('log file exists, checking extraction was successful') | |
if(re.search("kept 1", open('extracted/'+lineRS+'.log').read())!=None): | |
print("it was :-)") | |
continue | |
else: | |
print("it was NOT") | |
os.system(commandStart + j + ' --chr ' + str(chr) + ' --from-bp ' + str(linePos-1) + ' --to-bp ' + str(linePos+1) + ' --recode --recode-INFO-all --out extracted/' + lineRS) | |
else: | |
os.system(commandStart + j + ' --snp ' + lineRS + ' --recode --recode-INFO-all --out extracted/' + lineRS) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment