wangfan860 · March 14, 2019 17:31
diff --git a/haplotype_summary.py b/haplotype_summary.py
 #be sure to have scikit-allel, pandas, glob intalled
 import allel
 import pandas as pd
 import glob

 #change this line to match the path for vcf.gz files
 all_files = glob.glob("*.vcf.gz")

 #very first vcf for merging
 file1 = allel.read_vcf(all_files[0])
 sample= ''.join(file1['samples'])
 genotype=file1['calldata/GT']
 t_shape=genotype.shape
 genotype_2D=genotype.reshape(t_shape[0],t_shape[2])
 genotype_str=['{}/{}'.format(e[0],e[1]) for e in genotype_2D]
 chrom=file1['variants/CHROM']
 position=file1['variants/POS']
 dataset_f1 = pd.DataFrame({'CHR':chrom,'POS':position,sample:genotype_str})
 dataset_f1['CHR_POS']=dataset_f1['CHR'].astype(str)+'_'+dataset_f1['POS'].astype(str)
 final_dataset_f1=dataset_f1[['CHR_POS',sample]]

 #file_dataset_f1 will be the merged genotype info for all samples
 for filename in all_files[1:]:
    print(filename)
    df = allel.read_vcf(filename)
    sample= ''.join(df['samples'])
    genotype=df['calldata/GT']
    t_shape=genotype.shape
    genotype_2D=genotype.reshape(t_shape[0],t_shape[2])
    genotype_str=['{}/{}'.format(e[0],e[1]) for e in genotype_2D]
    chrom=df['variants/CHROM']
    position=df['variants/POS']
    dataset = pd.DataFrame({'CHR':chrom,'POS':position,sample:genotype_str})
    dataset['CHR_POS']=dataset['CHR'].astype(str)+'_'+dataset['POS'].astype(str)
    final_dataset=dataset[['CHR_POS',sample]]
    final_dataset_f1 =pd.merge(final_dataset_f1, final_dataset, how='outer', on='CHR_POS')
 final_dataset_f1.to_csv('/mnt/SCRATCH/genomel_summary/genotype_1305patient.csv')
 #summary will be the count of homozygous/heterozygous occurences for each snp
 summary=final_dataset_f1[['CHR_POS']]
 summary['1/1']=(final_dataset_f1 == '1/1').T.sum()
 summary['0/1']=(final_dataset_f1 == '0/1').T.sum()
 summary['0/0']=(final_dataset_f1 == '0/0').T.sum()
 summary.to_csv('/mnt/SCRATCH/genomel_summary/genotype_occurency_1305patient.csv')
	#be sure to have scikit-allel, pandas, glob intalled
	import allel
	import pandas as pd
	import glob

	#change this line to match the path for vcf.gz files
	all_files = glob.glob("*.vcf.gz")

	#very first vcf for merging
	file1 = allel.read_vcf(all_files[0])
	sample= ''.join(file1['samples'])
	genotype=file1['calldata/GT']
	t_shape=genotype.shape
	genotype_2D=genotype.reshape(t_shape[0],t_shape[2])
	genotype_str=['{}/{}'.format(e[0],e[1]) for e in genotype_2D]
	chrom=file1['variants/CHROM']
	position=file1['variants/POS']
	dataset_f1 = pd.DataFrame({'CHR':chrom,'POS':position,sample:genotype_str})
	dataset_f1['CHR_POS']=dataset_f1['CHR'].astype(str)+'_'+dataset_f1['POS'].astype(str)
	final_dataset_f1=dataset_f1[['CHR_POS',sample]]

	#file_dataset_f1 will be the merged genotype info for all samples
	for filename in all_files[1:]:
	print(filename)
	df = allel.read_vcf(filename)
	sample= ''.join(df['samples'])
	genotype=df['calldata/GT']
	t_shape=genotype.shape
	genotype_2D=genotype.reshape(t_shape[0],t_shape[2])
	genotype_str=['{}/{}'.format(e[0],e[1]) for e in genotype_2D]
	chrom=df['variants/CHROM']
	position=df['variants/POS']
	dataset = pd.DataFrame({'CHR':chrom,'POS':position,sample:genotype_str})
	dataset['CHR_POS']=dataset['CHR'].astype(str)+'_'+dataset['POS'].astype(str)
	final_dataset=dataset[['CHR_POS',sample]]
	final_dataset_f1 =pd.merge(final_dataset_f1, final_dataset, how='outer', on='CHR_POS')
	final_dataset_f1.to_csv('/mnt/SCRATCH/genomel_summary/genotype_1305patient.csv')
	#summary will be the count of homozygous/heterozygous occurences for each snp
	summary=final_dataset_f1[['CHR_POS']]
	summary['1/1']=(final_dataset_f1 == '1/1').T.sum()
	summary['0/1']=(final_dataset_f1 == '0/1').T.sum()
	summary['0/0']=(final_dataset_f1 == '0/0').T.sum()
	summary.to_csv('/mnt/SCRATCH/genomel_summary/genotype_occurency_1305patient.csv')