Convert VEP-annotated VCF file to pandas dataframe
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 5 14:21:42 2018
@author: nilesh-tawari
email: [email protected]
GitHub: https://github.com/nilesh-tawari
"""
from __future__ import print_function
import os
import argparse
import numpy as np
import pandas as pd
from collections import OrderedDict
import gzip
'''
Convert a VEP-annotated VCF file into a pandas dataframe
usage: python vepvcf_to_pandas.py -v test.vcf
'''
# Parameters
parser = argparse.ArgumentParser(description='Parse VCF output from VEP '
                                             'to generate pandas dataframes')
parser.add_argument('-v', '--vcf', required=True, help='Input VEP-annotated VCF file name')
args = parser.parse_args()
vep_anno_vcf = os.path.abspath(args.vcf)

def calc_af(row, AD, AO, DP):
    '''
    Calculate AF from the AO and DP columns, or from the AD column alone.
    '''
    try:
        if row[AD] == '-':
            val = float(row[AO]) / float(row[DP])
        else:
            ref_count = float(row[AD].split(',')[0])
            alt_count = float(row[AD].split(',')[1])
            val = alt_count / (ref_count + alt_count)
    except (IndexError, KeyError, ValueError, ZeroDivisionError, AttributeError):
        val = '.'
    return val
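# Illustrative example (hypothetical values): for AD = '12,8', calc_af returns
# 8 / (12 + 8) = 0.4; when AD is '-', the AO/DP fallback gives 8 / 20 = 0.4.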

def vcf_to_df(vcf_file, remove_cols=False):
    '''
    file -> df
    Opens the VCF file, collects the comment lines, the column names and the
    VEP (CSQ) header, and loads the records into a pandas dataframe. The
    FORMAT column is split into per-sample columns.
    '''
    if vcf_file.endswith('.gz'):
        vcf_file_read = gzip.open(vcf_file, 'r')
    else:
        vcf_file_read = open(vcf_file, 'r')
    vcf_comments = []
    vep_columns = ''
    for line in vcf_file_read:
        if vcf_file.endswith('.gz'):
            line = line.decode()
        if line.startswith('#'):
            vcf_comments.append(line)
        if line.startswith('##') and 'Format:' in line and 'CSQ' in line:
            # pipe-separated VEP column names from the CSQ INFO header line
            vep_columns = line[:-3].split('Format:')[1].strip().split('|')
        elif line.startswith('#CHROM'):
            vcf_columns = line.strip().strip('#').split('\t')
    vcf_file_read.close()
    sample_ids = vcf_columns[9:]
    df = pd.read_csv(vcf_file, sep='\t', comment='#', chunksize=1000,
                     low_memory=False, names=vcf_columns, iterator=True)
    df = pd.concat(list(df), ignore_index=True)
    for sample in sample_ids:
        # split the FORMAT keys against this sample's values into one column each
        df_info = pd.DataFrame(
            df.apply(lambda x: OrderedDict(zip(x['FORMAT'].split(':'),
                                               x[sample].split(':'))), axis=1).tolist(),
            index=df.index)
        # calculate AF
        AD, AO, DP = sample + ':AD', sample + ':AO', sample + ':DP'
        df_info = df_info.add_prefix(sample + ':')
        df_info[sample + ':calcAF'] = df_info.apply(calc_af, args=(AD, AO, DP), axis=1)
        df = pd.merge(left=df, right=df_info, left_index=True,
                      right_index=True, how='outer')
        df.fillna(value='-', inplace=True)
    if remove_cols:
        df.drop(sample_ids + ['FORMAT'], inplace=True, axis=1, errors='ignore')
    # pivot the key=value pairs of the INFO column into separate columns
    info = df.INFO.str.extractall('([^;]+)=([^;]+)')
    info = pd.Series(info.values[:, 1], [info.index.get_level_values(0),
                                         info.values[:, 0]]).unstack()
    df = pd.merge(left=df, right=info, left_index=True,
                  right_index=True, how='outer', suffixes=['', 'tag'])
    if remove_cols:
        df.drop(['INFO'], inplace=True, axis=1, errors='ignore')
    df['variant_ID'] = df.CHROM.astype(str) + '_' + df.POS.astype(str) + '_' + \
        df.REF.astype(str) + '/' + df.ALT.astype(str)
    return vcf_comments, vep_columns, vcf_columns, sample_ids, df
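# Illustrative example (hypothetical record): a FORMAT of 'GT:AD:DP' with a
# sample value of '0/1:12,8:20' becomes the per-sample columns '<sample>:GT',
# '<sample>:AD' and '<sample>:DP' (plus '<sample>:calcAF'), and an INFO string
# such as 'DP=20;MQ=60.0' is pivoted into the columns 'DP' and 'MQ'.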

def expand_csq(df, vep_columns, sort_pick=True):
    '''
    df -> df
    Expand the VEP consequence (CSQ) field into one row per transcript.
    '''
    df_tr = pd.DataFrame(df.CSQ.str.split(',', expand=True).stack().str.split('|', expand=True))
    df_tr.columns = vep_columns
    if 'gnomADg' in vep_columns:
        gnomad_cols = [col for col in vep_columns if 'gnomADg_' in col or col == 'MAX_AF']
        df_tr['MAX_AF_gnomADg'] = df_tr[gnomad_cols].apply(pd.to_numeric, errors='coerce').max(axis=1)
        df_tr['MAX_AF_gnomADg_pop'] = df_tr[gnomad_cols].apply(pd.to_numeric, errors='coerce').replace('', np.nan).idxmax(axis=1)
    if 'PICK' in vep_columns and sort_pick:
        df_tr['PICK'].replace('-', 0, inplace=True)
        df_tr['PICK'] = df_tr['PICK'].apply(pd.to_numeric)
        # within each variant (index level 0), put the PICK-ed transcript first
        df_tr = df_tr.groupby(level=0, group_keys=False).apply(
            lambda x: x.sort_values('PICK', ascending=False))
        df_tr['PICK'].replace({0: '-', 1: '1'}, inplace=True)
    df_tr.index = df_tr.index.droplevel(-1)
    df_tr.replace('', '-', inplace=True)
    df_tr.fillna(value='-', inplace=True)
    df_tr['ID'] = df_tr.index
    df_all_trans = pd.merge(left=df, right=df_tr, left_index=True,
                            right_on='ID', how='outer', suffixes=['', '_vep'])
    df_all_trans.drop(['ID_vep', 'CSQ'], inplace=True, axis=1, errors='ignore')
    df_single_trans = df_all_trans[~df_all_trans.index.duplicated()]
    df_snv = df_single_trans.loc[df_single_trans['VARIANT_CLASS'] == 'SNV']
    df_indel = df_single_trans.loc[~(df_single_trans['VARIANT_CLASS'] == 'SNV')]
    return df_single_trans, df_all_trans, df_snv, df_indel
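# Illustrative example (hypothetical CSQ value): 'T|missense_variant|...,T|intron_variant|...'
# is split on ',' into one row per transcript and then on '|' into the VEP
# columns, so a variant annotated against two transcripts contributes two rows
# to df_all_trans but only its first row to df_single_trans.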
vcf_comments, vep_columns, vcf_columns, sample_ids, df = vcf_to_df(vep_anno_vcf, remove_cols=True)
df_single_trans, df_all_trans, df_snv, df_indel = expand_csq(df, vep_columns, sort_pick=True)
## The following dataframes are produced:
# df: unexpanded VCF columns
# df_single_trans: fully expanded VCF, one (picked) transcript per variant
# df_all_trans: fully expanded VCF, all transcripts
# df_snv: df with SNVs
# df_indel: df with indels
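# A minimal sketch (not part of the original gist) for persisting the results;
# the derived output file names are placeholders.
out_prefix = os.path.splitext(os.path.basename(vep_anno_vcf))[0]
df_single_trans.to_csv(out_prefix + '.single_transcript.tsv', sep='\t', index=False)
df_all_trans.to_csv(out_prefix + '.all_transcripts.tsv', sep='\t', index=False)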
xiucz commented Jul 29, 2019

When I want to parse an ANNOVAR-annotated VCF using lines 85, 86:

info = pd.Series(info.values[:, 1], [info.index.get_level_values(0) \
                    , info.values[:, 0]]).unstack()

I got stuck here.

ValueError: Index contains duplicate entries, cannot reshape
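
One possible workaround (a sketch, not from the original gist): unstack() fails when the same INFO key, or a key-like fragment matched by the regex, occurs more than once for a single record, which is evidently the case for the ANNOVAR-annotated VCF here. Aggregating the duplicated (row, key) pairs before reshaping avoids the ValueError:

    info = df.INFO.str.extractall('([^;]+)=([^;]+)')
    info = pd.Series(info.values[:, 1],
                     index=[info.index.get_level_values(0), info.values[:, 0]])
    info = info.groupby(level=[0, 1]).last().unstack()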
