cth · October 26, 2015 10:00
diff --git a/README b/README


 For imputed genotypes used the batch of 2015-04-27 which include DCH an other smaller cohorts.
 This data is imputed using impute2 version 2.3.0 and phased using shapeit v2.r727. 
 Variants with p(HWE) < 0.000001 or callrate less than 95%, MAF less than %1 were removed prior 
 to impution. The panel used was 1000 Genomes phase 1.

 The dosages all 48 SNPs were extracted from the VCF created from the imputed data 
 and merged with the 13 directly genotyped SNPs. Whenever, directly genotyped variants
 are available they are used in place of the imputed dosages.
 The number of inconsistencies between genotyped SNP and rounded imputed dosage are quite few, 
 ranging from 0 to 7.

 The final data set contains 3579 individuals represented in a tab separated file where 
 each column correspond to a particular SNP.  Dosages/genotypes are always reported for
 the reference allele on plus strand of GRC37, i.e., 0 means homozygote reference allele.
diff --git a/go.sh b/go.sh
 ln -s /eva/users/fng514/imputation/batch-2015-04-27/VCFs
 mkdir -p extracted/

 tail -n+2 summary_whr_snps.txt|awk '{print $2}'|tr -d '"' > rsnumbers.txt
 tail -n+2 summary_whr_snps.txt|awk '{print $10,$11,$2}' |tr -d '"' > chr_pos_rs.txt
 echo "6 32381736 rs7759742" >> chr_pos_rs.txt # position obtained from DBSNP
 #grep -w -f rsnumbers.txt /eva/data/clean/DCH_Unik/chip/DCH_Unik_PLUS_b37_final.bim|awk '{print $2}' > on_chip_snps.txt
 plink19 --bfile /eva/data/clean/DCH_Unik/chip/DCH_Unik_PLUS_b37_final --recode A tab --extract on_chip_snps.txt --out on_chip

 python vcf_extractor.py



 oneVCF=`ls extracted/*vcf|tr "\n" " "| cut -d" " -f1 `
 # header lines from one VCF file
 cat $oneVCF | grep "^#.*$" | sort > merged.vcf 
 cat extracted/*vcf | grep -v "^#.*" |sort -n|uniq >> merged.vcf
 cat on_chip.ped|cut -f1 > keep.txt
 vcftools --vcf merged.vcf --keep keep.txt --recode --out 42x 

 cat merged.vcf|awk '{print $3,$8}'|cut -d";" -f1|tr '=' ' '|sed 's/exm-//g' > INFO_scores.txt

 ruby merge_chip_impute.rb
diff --git a/merge_chip_impute.rb b/merge_chip_impute.rb
 require 'bio-samtools'

 individuals = []	
 dosages = {}
 snps = []

 ref_alleles = {} 

 File.open("42x.recode.vcf") do |vcf| 
 	vcf.each do |line|
 		if line =~ /CHROM/ then
 			fields = line.chomp.split("\t")
 			9.upto(fields.size-1) do |i|
 				individuals << fields[i]
 			end
 		end

 		vcfline=Bio::DB::Vcf.new(line)
 		next if vcfline.samples.size == 0

 		rs = vcfline.id.gsub("exm-", "")
 		ref_alleles[rs] = vcfline.ref 

 		snps << rs 

 		1.upto(individuals.size) do |i|
 			if dosages[rs].nil? then
 				dosages[rs] = [ vcfline.samples[i.to_s]["DS"].to_f ]
 			else
 				dosages[rs] << vcfline.samples[i.to_s]["DS"].to_f
 			end
 		end
 	end
 end

 inconsistent = {}
 map_rs = []
 alleles = []

 File.open("on_chip.raw") do |rawfile|
 	rawfile.each do |line|
 		fields = line.chomp.split("\t") 
 		puts fields[0]
 		if fields[0] == "FID" then
 			puts "here"
 			6.times { fields.shift }
 			map_rs = fields.collect { |x| x.gsub("exm-", "").gsub(/(.*)_(.)/,"\\1") }
 			alleles = fields.collect { |x| x.gsub("exm-", "").gsub(/(.*)_(.)/,"\\2") }
 			map_rs.each { |a| inconsistent[a] = 0 }
 			puts "maprs: " + map_rs.inspect
 			puts "alleles: " + alleles.inspect

 			next
 		end

 		indv = fields.shift
 		5.times { fields.shift }
 		s = 0

 		throw "bad indv #{indv}" unless individuals.include?(indv)

 		0.upto(fields.size-1) do |s| 
 			rs = map_rs[s]
 			alelle = alleles[s]
 			geno = fields[s]
 			throw "bad snp" unless dosages.keys.include?(rs)
 			
 			old_dosage = dosages[rs][individuals.rindex(indv)]
 			if alelle != ref_alleles[rs]
 				dosages[rs][individuals.rindex(indv)] = geno.to_f 
 			else
 				dosages[rs][individuals.rindex(indv)] = 2-geno.to_f 
 			end
 			if old_dosage.round != dosages[rs][individuals.rindex(indv)]
 				inconsistent[rs] = inconsistent[rs] + 1
 			end
 		end
 	end
 end

 puts inconsistent.inspect

 File.open("merged_impute_chip.csv", "w") do |outfile|
 	# Write output as csv file
 	outfile.puts (["particid"] + (ref_alleles.collect {|x| x.join("_") })).join("\t")
 	0.upto(individuals.size-1) do |i|
 		outfile.puts ([individuals[i]] + snps.collect { |s| dosages[s][i] }).join("\t")
 	end
 end
diff --git a/vcf_extractor.py b/vcf_extractor.py
 #!/usr/bin/python
 #$ -S /usr/bin/python
 #$ -cwd 

 import os
 import re

 extractList = open('chr_pos_rs.txt', 'r')
 vcfs = os.listdir('VCFs/')

 #commandStart = 'plink19 --vcf VCFs/'
 commandStart = "/home/fng514/bin/vcftools --gzvcf VCFs/"

 for line in extractList:
    splitLine = line.split()
    lineChr = int(splitLine[0])
    linePos = int(splitLine[1])
    lineRS = splitLine[2]
    print(lineRS)
    for j in vcfs:
 	if(re.search('.tbi',j)==None):
            continue
        j = re.sub('.tbi','',j)
        
        i = j.split('.')
        chr=int(i[2])

        batch = i[3].split('-')
        startPos=int(batch[0])
        endPos=int(batch[1])

        if(chr==lineChr):
            if(linePos < endPos and linePos > startPos):
                print(j)
                print("extracted/"+lineRS+".recode.vcf")
                if os.path.isfile('extracted/'+lineRS+'.log'):
                    print('log file exists, checking extraction was successful') 
                    if(re.search("kept 1", open('extracted/'+lineRS+'.log').read())!=None):
                        print("it was :-)")   
                        continue
                    else:
                        print("it was NOT")   
                        os.system(commandStart + j + ' --chr ' + str(chr) + ' --from-bp ' + str(linePos-1) + ' --to-bp ' + str(linePos+1) + ' --recode --recode-INFO-all --out extracted/' + lineRS)
                else:
                    os.system(commandStart + j + ' --snp ' + lineRS + ' --recode --recode-INFO-all --out extracted/' + lineRS)


	For imputed genotypes used the batch of 2015-04-27 which include DCH an other smaller cohorts.
	This data is imputed using impute2 version 2.3.0 and phased using shapeit v2.r727.
	Variants with p(HWE) < 0.000001 or callrate less than 95%, MAF less than %1 were removed prior
	to impution. The panel used was 1000 Genomes phase 1.

	The dosages all 48 SNPs were extracted from the VCF created from the imputed data
	and merged with the 13 directly genotyped SNPs. Whenever, directly genotyped variants
	are available they are used in place of the imputed dosages.
	The number of inconsistencies between genotyped SNP and rounded imputed dosage are quite few,
	ranging from 0 to 7.

	The final data set contains 3579 individuals represented in a tab separated file where
	each column correspond to a particular SNP. Dosages/genotypes are always reported for
	the reference allele on plus strand of GRC37, i.e., 0 means homozygote reference allele.
	ln -s /eva/users/fng514/imputation/batch-2015-04-27/VCFs
	mkdir -p extracted/

	tail -n+2 summary_whr_snps.txt\|awk '{print $2}'\|tr -d '"' > rsnumbers.txt
	tail -n+2 summary_whr_snps.txt\|awk '{print $10,$11,$2}' \|tr -d '"' > chr_pos_rs.txt
	echo "6 32381736 rs7759742" >> chr_pos_rs.txt # position obtained from DBSNP
	#grep -w -f rsnumbers.txt /eva/data/clean/DCH_Unik/chip/DCH_Unik_PLUS_b37_final.bim\|awk '{print $2}' > on_chip_snps.txt
	plink19 --bfile /eva/data/clean/DCH_Unik/chip/DCH_Unik_PLUS_b37_final --recode A tab --extract on_chip_snps.txt --out on_chip

	python vcf_extractor.py



	oneVCF=`ls extracted/*vcf\|tr "\n" " "\| cut -d" " -f1 `
	# header lines from one VCF file
	cat $oneVCF \| grep "^#.*$" \| sort > merged.vcf
	cat extracted/vcf \| grep -v "^#." \|sort -n\|uniq >> merged.vcf
	cat on_chip.ped\|cut -f1 > keep.txt
	vcftools --vcf merged.vcf --keep keep.txt --recode --out 42x

	cat merged.vcf\|awk '{print $3,$8}'\|cut -d";" -f1\|tr '=' ' '\|sed 's/exm-//g' > INFO_scores.txt

	ruby merge_chip_impute.rb
	require 'bio-samtools'

	individuals = []
	dosages = {}
	snps = []

	ref_alleles = {}

	File.open("42x.recode.vcf") do \|vcf\|
	vcf.each do \|line\|
	if line =~ /CHROM/ then
	fields = line.chomp.split("\t")
	9.upto(fields.size-1) do \|i\|
	individuals << fields[i]
	end
	end

	vcfline=Bio::DB::Vcf.new(line)
	next if vcfline.samples.size == 0

	rs = vcfline.id.gsub("exm-", "")
	ref_alleles[rs] = vcfline.ref

	snps << rs

	1.upto(individuals.size) do \|i\|
	if dosages[rs].nil? then
	dosages[rs] = [ vcfline.samples[i.to_s]["DS"].to_f ]
	else
	dosages[rs] << vcfline.samples[i.to_s]["DS"].to_f
	end
	end
	end
	end

	inconsistent = {}
	map_rs = []
	alleles = []

	File.open("on_chip.raw") do \|rawfile\|
	rawfile.each do \|line\|
	fields = line.chomp.split("\t")
	puts fields[0]
	if fields[0] == "FID" then
	puts "here"
	6.times { fields.shift }
	map_rs = fields.collect { \|x\| x.gsub("exm-", "").gsub(/(.*)_(.)/,"\\1") }
	alleles = fields.collect { \|x\| x.gsub("exm-", "").gsub(/(.*)_(.)/,"\\2") }
	map_rs.each { \|a\| inconsistent[a] = 0 }
	puts "maprs: " + map_rs.inspect
	puts "alleles: " + alleles.inspect

	next
	end

	indv = fields.shift
	5.times { fields.shift }
	s = 0

	throw "bad indv #{indv}" unless individuals.include?(indv)

	0.upto(fields.size-1) do \|s\|
	rs = map_rs[s]
	alelle = alleles[s]
	geno = fields[s]
	throw "bad snp" unless dosages.keys.include?(rs)

	old_dosage = dosages[rs][individuals.rindex(indv)]
	if alelle != ref_alleles[rs]
	dosages[rs][individuals.rindex(indv)] = geno.to_f
	else
	dosages[rs][individuals.rindex(indv)] = 2-geno.to_f
	end
	if old_dosage.round != dosages[rs][individuals.rindex(indv)]
	inconsistent[rs] = inconsistent[rs] + 1
	end
	end
	end
	end

	puts inconsistent.inspect

	File.open("merged_impute_chip.csv", "w") do \|outfile\|
	# Write output as csv file
	outfile.puts (["particid"] + (ref_alleles.collect {\|x\| x.join("_") })).join("\t")
	0.upto(individuals.size-1) do \|i\|
	outfile.puts ([individuals[i]] + snps.collect { \|s\| dosages[s][i] }).join("\t")
	end
	end
	#!/usr/bin/python
	#$ -S /usr/bin/python
	#$ -cwd

	import os
	import re

	extractList = open('chr_pos_rs.txt', 'r')
	vcfs = os.listdir('VCFs/')

	#commandStart = 'plink19 --vcf VCFs/'
	commandStart = "/home/fng514/bin/vcftools --gzvcf VCFs/"

	for line in extractList:
	splitLine = line.split()
	lineChr = int(splitLine[0])
	linePos = int(splitLine[1])
	lineRS = splitLine[2]
	print(lineRS)
	for j in vcfs:
	if(re.search('.tbi',j)==None):
	continue
	j = re.sub('.tbi','',j)

	i = j.split('.')
	chr=int(i[2])

	batch = i[3].split('-')
	startPos=int(batch[0])
	endPos=int(batch[1])

	if(chr==lineChr):
	if(linePos < endPos and linePos > startPos):
	print(j)
	print("extracted/"+lineRS+".recode.vcf")
	if os.path.isfile('extracted/'+lineRS+'.log'):
	print('log file exists, checking extraction was successful')
	if(re.search("kept 1", open('extracted/'+lineRS+'.log').read())!=None):
	print("it was :-)")
	continue
	else:
	print("it was NOT")
	os.system(commandStart + j + ' --chr ' + str(chr) + ' --from-bp ' + str(linePos-1) + ' --to-bp ' + str(linePos+1) + ' --recode --recode-INFO-all --out extracted/' + lineRS)
	else:
	os.system(commandStart + j + ' --snp ' + lineRS + ' --recode --recode-INFO-all --out extracted/' + lineRS)