Skip to content

Instantly share code, notes, and snippets.

@wangfan860
Last active March 14, 2019 17:31
Show Gist options
  • Save wangfan860/cc97b12e0e7a8aa9a915709f0f0ac410 to your computer and use it in GitHub Desktop.
Save wangfan860/cc97b12e0e7a8aa9a915709f0f0ac410 to your computer and use it in GitHub Desktop.
haplotype_summary
#be sure to have scikit-allel, pandas, glob intalled
import allel
import pandas as pd
import glob
#change this line to match the path for vcf.gz files
all_files = glob.glob("*.vcf.gz")
#very first vcf for merging
file1 = allel.read_vcf(all_files[0])
sample= ''.join(file1['samples'])
genotype=file1['calldata/GT']
t_shape=genotype.shape
genotype_2D=genotype.reshape(t_shape[0],t_shape[2])
genotype_str=['{}/{}'.format(e[0],e[1]) for e in genotype_2D]
chrom=file1['variants/CHROM']
position=file1['variants/POS']
dataset_f1 = pd.DataFrame({'CHR':chrom,'POS':position,sample:genotype_str})
dataset_f1['CHR_POS']=dataset_f1['CHR'].astype(str)+'_'+dataset_f1['POS'].astype(str)
final_dataset_f1=dataset_f1[['CHR_POS',sample]]
#file_dataset_f1 will be the merged genotype info for all samples
for filename in all_files[1:]:
print(filename)
df = allel.read_vcf(filename)
sample= ''.join(df['samples'])
genotype=df['calldata/GT']
t_shape=genotype.shape
genotype_2D=genotype.reshape(t_shape[0],t_shape[2])
genotype_str=['{}/{}'.format(e[0],e[1]) for e in genotype_2D]
chrom=df['variants/CHROM']
position=df['variants/POS']
dataset = pd.DataFrame({'CHR':chrom,'POS':position,sample:genotype_str})
dataset['CHR_POS']=dataset['CHR'].astype(str)+'_'+dataset['POS'].astype(str)
final_dataset=dataset[['CHR_POS',sample]]
final_dataset_f1 =pd.merge(final_dataset_f1, final_dataset, how='outer', on='CHR_POS')
final_dataset_f1.to_csv('/mnt/SCRATCH/genomel_summary/genotype_1305patient.csv')
#summary will be the count of homozygous/heterozygous occurences for each snp
summary=final_dataset_f1[['CHR_POS']]
summary['1/1']=(final_dataset_f1 == '1/1').T.sum()
summary['0/1']=(final_dataset_f1 == '0/1').T.sum()
summary['0/0']=(final_dataset_f1 == '0/0').T.sum()
summary.to_csv('/mnt/SCRATCH/genomel_summary/genotype_occurency_1305patient.csv')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment