Last active
October 12, 2021 18:23
-
-
Save Abusagit/9ce5acfa44deee1681a9f3c7d3c33580 to your computer and use it in GitHub Desktop.
Can be used from command line, takes 2 arguments - $1 - .vcf file path, $2 - .gff annotation file path and returns genes with variation with respect to reference genome.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!usr/bin/python | |
# -*- coding: utf-8 -*- | |
# | |
#@created: 09.10.2021 | |
#@author: Fyodor Velikonivtsev | |
#@contact: [email protected] | |
import bisect | |
import io | |
import os | |
import sys | |
import pandas as pd | |
import sys | |
pd.set_option('display.max_rows', 500) | |
pd.set_option('display.max_columns', 500) | |
pd.set_option('display.width', 1000) | |
def read_vcf(path): | |
with open(path, 'r') as f: | |
lines = tuple(l for l in f if not l.startswith('##')) # ignore meta information | |
return pd.read_csv(io.StringIO(''.join(lines)), dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str, 'QUAL': str, 'FILTER': str, 'INFO': str}, sep='\t').rename(columns={'#CHROM': 'CHROM'}) | |
def read_annotation(annotation_path): | |
gencode = pd.read_table(annotation_path, comment='#', sep='\t', names=['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']) | |
gencode_genes = gencode[gencode['feature'] == 'gene'][['seqname', 'start', 'end', 'attribute']].copy().reset_index() # TODO get rid of copy and collect info about localization too | |
gencode_genes.drop('index', axis='columns', inplace=True) | |
return gencode_genes | |
def get_mutations(gff_file, vcf_file): | |
genes_starts = tuple(gff_file.loc[:, 'start']) | |
mutated = [bisect.bisect_left(genes_starts, mutation_index) - 1 for mutation_index in vcf_file['POS']] | |
mutated_genes = gff_file.iloc[mutated].reset_index() | |
columns = {key: [] for key in ['ID', 'Dbxref', 'Name', 'gbkey', 'gene', 'gene_biotype', 'gene_synonym', 'locus_tag']} | |
genes_info = [gene.split(';') for gene in tuple(mutated_genes['attribute'])] | |
for gene in genes_info: | |
for attr in gene: | |
key, value = attr.split("=") | |
columns[key].append(value) | |
frame = pd.DataFrame.from_dict(columns).set_index('Name') | |
return frame | |
if __name__ == "__main__": | |
vcf_path, annotation_path = sys.argv[1:] | |
separator = f"{''.join('*' for _ in range(110))}" | |
print(separator, file=sys.stderr) | |
vcf_file = read_vcf(vcf_path) | |
print("Read .vcf file\nStarted reading .gff file...\n\n\n", file=sys.stderr) | |
gff_file = read_annotation(annotation_path) | |
print("Done!\nStarted building mutations dataframe...\n\n\n", file=sys.stderr) | |
print(get_mutations(gff_file, vcf_file), file=sys.stdout) | |
print(f"Done!\n{separator}", file=sys.stderr) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment