Skip to content

Instantly share code, notes, and snippets.

@Abusagit
Last active October 12, 2021 18:23
Show Gist options
  • Save Abusagit/9ce5acfa44deee1681a9f3c7d3c33580 to your computer and use it in GitHub Desktop.
Save Abusagit/9ce5acfa44deee1681a9f3c7d3c33580 to your computer and use it in GitHub Desktop.
Can be used from command line, takes 2 arguments - $1 - .vcf file path, $2 - .gff annotation file path and returns genes with variation with respect to reference genome.
#!usr/bin/python
# -*- coding: utf-8 -*-
#
#@created: 09.10.2021
#@author: Fyodor Velikonivtsev
#@contact: [email protected]
import bisect
import io
import os
import sys
import pandas as pd
import sys
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
def read_vcf(path):
with open(path, 'r') as f:
lines = tuple(l for l in f if not l.startswith('##')) # ignore meta information
return pd.read_csv(io.StringIO(''.join(lines)), dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str, 'QUAL': str, 'FILTER': str, 'INFO': str}, sep='\t').rename(columns={'#CHROM': 'CHROM'})
def read_annotation(annotation_path):
gencode = pd.read_table(annotation_path, comment='#', sep='\t', names=['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute'])
gencode_genes = gencode[gencode['feature'] == 'gene'][['seqname', 'start', 'end', 'attribute']].copy().reset_index() # TODO get rid of copy and collect info about localization too
gencode_genes.drop('index', axis='columns', inplace=True)
return gencode_genes
def get_mutations(gff_file, vcf_file):
genes_starts = tuple(gff_file.loc[:, 'start'])
mutated = [bisect.bisect_left(genes_starts, mutation_index) - 1 for mutation_index in vcf_file['POS']]
mutated_genes = gff_file.iloc[mutated].reset_index()
columns = {key: [] for key in ['ID', 'Dbxref', 'Name', 'gbkey', 'gene', 'gene_biotype', 'gene_synonym', 'locus_tag']}
genes_info = [gene.split(';') for gene in tuple(mutated_genes['attribute'])]
for gene in genes_info:
for attr in gene:
key, value = attr.split("=")
columns[key].append(value)
frame = pd.DataFrame.from_dict(columns).set_index('Name')
return frame
if __name__ == "__main__":
vcf_path, annotation_path = sys.argv[1:]
separator = f"{''.join('*' for _ in range(110))}"
print(separator, file=sys.stderr)
vcf_file = read_vcf(vcf_path)
print("Read .vcf file\nStarted reading .gff file...\n\n\n", file=sys.stderr)
gff_file = read_annotation(annotation_path)
print("Done!\nStarted building mutations dataframe...\n\n\n", file=sys.stderr)
print(get_mutations(gff_file, vcf_file), file=sys.stdout)
print(f"Done!\n{separator}", file=sys.stderr)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment