Skip to content

Instantly share code, notes, and snippets.

@genomewalker
Last active August 5, 2024 07:47
Show Gist options
  • Save genomewalker/fa1e589595b44eedf0b16384fa8c4559 to your computer and use it in GitHub Desktop.
Save genomewalker/fa1e589595b44eedf0b16384fa8c4559 to your computer and use it in GitHub Desktop.
Evaluate miniprot results

Let's process the PAF output from miniprot to get some stats:

# Write one .tsv next to every .paf; strip only the ".paf" suffix so a "paf" elsewhere in the name is left alone.
for i in *.paf; do python ../paf-stats.py -i "${i}" -o "${i%.paf}.tsv"; done
import re
import pandas as pd
import argparse
import json
from collections import defaultdict
import sys
class MiniprotStats:
    """Collect per-alignment statistics from a miniprot PAF file.

    Every PAF line with at least 12 columns becomes one record holding the
    twelve mandatory PAF columns plus counters derived from the ``cg:Z:``
    (CIGAR) and ``cs:Z:`` (difference string) tags.  Records accumulate in
    ``self.records`` and can be exported with :meth:`save_stats`.
    """

    # From: https://github.com/linsalrob/genetic_codes/blob/main/pygenetic_code/genetic_code.py
    # Codon assignments of translation table 1.  Every other table is stored
    # below as the handful of codons it reassigns relative to this one.
    _STANDARD_CODONS = {
        "AAA": "Lys", "AAT": "Asn", "AAG": "Lys", "AAC": "Asn",
        "ATA": "Ile", "ATT": "Ile", "ATG": "Met", "ATC": "Ile",
        "AGA": "Arg", "AGT": "Ser", "AGG": "Arg", "AGC": "Ser",
        "ACA": "Thr", "ACT": "Thr", "ACG": "Thr", "ACC": "Thr",
        "TAA": "Ter", "TAT": "Tyr", "TAG": "Ter", "TAC": "Tyr",
        "TTA": "Leu", "TTT": "Phe", "TTG": "Leu", "TTC": "Phe",
        "TGA": "Ter", "TGT": "Cys", "TGG": "Trp", "TGC": "Cys",
        "TCA": "Ser", "TCT": "Ser", "TCG": "Ser", "TCC": "Ser",
        "GAA": "Glu", "GAT": "Asp", "GAG": "Glu", "GAC": "Asp",
        "GTA": "Val", "GTT": "Val", "GTG": "Val", "GTC": "Val",
        "GGA": "Gly", "GGT": "Gly", "GGG": "Gly", "GGC": "Gly",
        "GCA": "Ala", "GCT": "Ala", "GCG": "Ala", "GCC": "Ala",
        "CAA": "Gln", "CAT": "His", "CAG": "Gln", "CAC": "His",
        "CTA": "Leu", "CTT": "Leu", "CTG": "Leu", "CTC": "Leu",
        "CGA": "Arg", "CGT": "Arg", "CGG": "Arg", "CGC": "Arg",
        "CCA": "Pro", "CCT": "Pro", "CCG": "Pro", "CCC": "Pro",
    }

    # table id (as a string) -> (initiator codons, reassignments vs. table 1)
    _CODE_VARIANTS = {
        "1": (["TTG", "CTG", "ATG"], {}),
        "2": (
            ["GTG", "ATC", "ATA", "ATT", "ATG"],
            {"ATA": "Met", "AGA": "Ter", "AGG": "Ter", "TGA": "Trp"},
        ),
        "3": (
            ["ATA", "ATG", "GTG"],
            {
                "ATA": "Met", "TGA": "Trp",
                "CTA": "Thr", "CTT": "Thr", "CTG": "Thr", "CTC": "Thr",
            },
        ),
        "4": (
            ["GTG", "TTA", "ATC", "ATA", "ATT", "TTG", "CTG", "ATG"],
            {"TGA": "Trp"},
        ),
        "5": (
            ["GTG", "ATC", "ATA", "ATT", "TTG", "ATG"],
            {"ATA": "Met", "AGA": "Ser", "AGG": "Ser", "TGA": "Trp"},
        ),
        "6": (["ATG"], {"TAA": "Gln", "TAG": "Gln"}),
        "9": (
            ["ATG", "GTG"],
            {"AAA": "Asn", "AGA": "Ser", "AGG": "Ser", "TGA": "Trp"},
        ),
        "10": (["ATG"], {"TGA": "Cys"}),
        "11": (["GTG", "ATC", "ATA", "ATT", "TTG", "CTG", "ATG"], {}),
        "12": (["CTG", "ATG"], {"CTG": "Ser"}),
        "13": (
            ["TTG", "ATA", "ATG", "GTG"],
            {"ATA": "Met", "AGA": "Gly", "AGG": "Gly", "TGA": "Trp"},
        ),
        "14": (
            ["ATG"],
            {
                "AAA": "Asn", "AGA": "Ser", "AGG": "Ser",
                "TAA": "Tyr", "TGA": "Trp",
            },
        ),
        "15": (["ATG"], {"TAG": "Gln"}),
        "16": (["ATG"], {"TAG": "Leu"}),
        "21": (
            ["ATG", "GTG"],
            {
                "AAA": "Asn", "ATA": "Met", "AGA": "Ser",
                "AGG": "Ser", "TGA": "Trp",
            },
        ),
        "22": (["ATG"], {"TAG": "Leu", "TCA": "Ter"}),
        "23": (["ATG", "ATT", "GTG"], {"TTA": "Ter"}),
        "24": (
            ["TTG", "CTG", "ATG", "GTG"],
            {"AGA": "Ser", "AGG": "Lys", "TGA": "Trp"},
        ),
        "25": (["TTG", "ATG", "GTG"], {"TGA": "Gly"}),
        "26": (["CTG", "ATG"], {"CTG": "Ala"}),
        "27": (["ATG"], {"TAA": "Gln", "TAG": "Gln", "TGA": "Trp"}),
        "28": (["ATG"], {"TAA": "Gln", "TAG": "Gln", "TGA": "Trp"}),
        "29": (["ATG"], {"TAA": "Tyr", "TAG": "Tyr"}),
        "30": (["ATG"], {"TAA": "Glu", "TAG": "Glu"}),
        "31": (["ATG"], {"TAA": "Glu", "TAG": "Glu", "TGA": "Trp"}),
    }

    # Compiled once: both patterns are applied to every alignment line.
    _CIGAR_FRAMESHIFT_RE = re.compile(r"([0-9]+)([FfGg])")
    _CS_OP_RE = re.compile(
        r"(:[0-9]+|\*[acgtn]+\S|\+[A-Z]+|\-[acgtn]+|~[acgtn]{2}[0-9]+[acgtn]{2})"
    )

    def __init__(self, paf_file, translation_table):
        """Remember the PAF path and load the requested genetic code.

        Args:
            paf_file: Path of the PAF file to read in :meth:`parse_paf`.
            translation_table: Genetic-code table id understood by
                :meth:`get_translation_table`.
        """
        self.paf_file = paf_file
        self.translation_table, self.initiators = self.get_translation_table(
            translation_table
        )
        self.stats = defaultdict(int)
        self.records = []

    def three_letters_to_one_letter(self):
        """Return the three-letter -> one-letter amino-acid map (``Ter`` -> ``*``)."""
        return {
            "Ala": "A",
            "Arg": "R",
            "Asn": "N",
            "Asp": "D",
            "Cys": "C",
            "Gln": "Q",
            "Glu": "E",
            "Gly": "G",
            "His": "H",
            "Ile": "I",
            "Leu": "L",
            "Lys": "K",
            "Met": "M",
            "Phe": "F",
            "Pro": "P",
            "Ser": "S",
            "Thr": "T",
            "Trp": "W",
            "Tyr": "Y",
            "Val": "V",
            "Ter": "*",
        }

    def get_translation_table(self, table_id):
        """Build the codon table and initiator list for ``table_id``.

        Args:
            table_id: Table id; it is matched by its ``str()`` form.

        Returns:
            ``(codons, initiators)`` where ``codons`` maps each lowercase
            codon to a one-letter amino acid (``*`` for a stop) and
            ``initiators`` is a list of lowercase start codons.

        Raises:
            SystemExit: with status 1 when the table id is not known; the
                message is written to stderr so it cannot end up in piped
                output.
        """
        key = str(table_id)
        if key not in self._CODE_VARIANTS:
            print(
                f"ERROR: Genetic code table {table_id} not found",
                file=sys.stderr,
            )
            sys.exit(1)
        start_codons, reassigned = self._CODE_VARIANTS[key]
        three2one = self.three_letters_to_one_letter()
        # Merge into a fresh dict so the shared class-level table is never mutated.
        merged = {**self._STANDARD_CODONS, **reassigned}
        codons = {codon.lower(): three2one[aa] for codon, aa in merged.items()}
        initiators = [codon.lower() for codon in start_codons]
        return codons, initiators

    def parse_paf(self):
        """Read ``self.paf_file`` and add one record per usable line."""
        with open(self.paf_file, "r") as file:
            for line in file:
                fields = line.strip().split("\t")
                if len(fields) < 12:
                    continue  # Ignore incomplete lines
                self.process_fields(fields)

    @staticmethod
    def _ratio(numerator, denominator):
        """Return ``numerator / denominator``, or NaN for a zero denominator."""
        return numerator / denominator if denominator else float("nan")

    def process_fields(self, fields):
        """Turn one split PAF line into a record and append it to ``self.records``.

        Args:
            fields: The tab-separated columns of one PAF line; the first
                twelve are the mandatory columns, the rest are tags.

        Raises:
            ValueError: if a numeric column cannot be converted with ``int``.
        """
        record = {
            "query_name": fields[0],
            "query_length": int(fields[1]),
            "query_start": int(fields[2]),
            "query_end": int(fields[3]),
            "strand": fields[4],
            "target_name": fields[5],
            "target_length": int(fields[6]),
            "target_start": int(fields[7]),
            "target_end": int(fields[8]),
            "residue_matches": int(fields[9]),
            "alignment_block_length": int(fields[10]),
            "mapping_quality": int(fields[11]),
        }
        frameshift_deletion_events = 0
        frameshift_match_events = 0
        frameshift_deletion_nucleotides = 0
        frameshift_match_nucleotides = 0
        frameshift_match_amino_acids = 0
        inframe_stop_count = 0
        synonymous = 0
        non_synonymous = 0
        insertion_count = 0
        deletion_count = 0
        substitution_count = 0
        other_event_count = 0
        for tag in fields[12:]:
            if tag.startswith("cg:Z:"):
                events, del_nt, match_nt, match_aa = self.count_frameshifts(tag[5:])
                frameshift_deletion_events += events["deletion"]
                frameshift_match_events += events["match"]
                frameshift_deletion_nucleotides += del_nt
                frameshift_match_nucleotides += match_nt
                frameshift_match_amino_acids += match_aa
            elif tag.startswith("cs:Z:"):
                # The per-tag truncation flag is ignored: truncation is decided
                # below from the stop count summed over all tags.
                syn, non_syn, ins, del_, sub, other, _, stops = self.parse_cs_tag(
                    tag[5:]
                )
                synonymous += syn
                non_synonymous += non_syn
                insertion_count += ins
                deletion_count += del_
                substitution_count += sub
                other_event_count += other
                inframe_stop_count += stops
        # An alignment counts as truncated only when it has an in-frame stop;
        # frameshifts are reported in their own columns and do not set this.
        truncation = inframe_stop_count > 0
        record.update(
            {
                "frameshift_deletion_events": frameshift_deletion_events,
                "frameshift_match_events": frameshift_match_events,
                "frameshift_deletion_nucleotides": frameshift_deletion_nucleotides,
                "frameshift_match_nucleotides": frameshift_match_nucleotides,
                "frameshift_match_amino_acids": frameshift_match_amino_acids,
                "inframe_stops": inframe_stop_count,
                "synonymous": synonymous,
                "non_synonymous": non_synonymous,
                "insertions": insertion_count,
                "deletions": deletion_count,
                "substitutions": substitution_count,
                "other_events": other_event_count,
                "truncated": truncation,
                # The three ratios are fractions (0-1), not percentages; a zero
                # denominator yields NaN instead of aborting the whole run.
                "query_coverage": self._ratio(
                    record["query_end"] - record["query_start"],
                    record["query_length"],
                ),
                "target_coverage": self._ratio(
                    record["target_end"] - record["target_start"],
                    record["target_length"],
                ),
                "identity_percentage": self._ratio(
                    record["residue_matches"], record["alignment_block_length"]
                ),
            }
        )
        self.records.append(record)

    def count_frameshifts(self, cigar):
        """Count the ``F``/``G`` operations of a CIGAR string.

        Args:
            cigar: CIGAR text (the ``cg:Z:`` tag without its prefix).

        Returns:
            ``(events, deletion_nucleotides, match_nucleotides,
            match_amino_acids)`` where ``events`` has the keys ``"deletion"``
            (number of ``F`` ops) and ``"match"`` (number of ``G`` ops), the
            nucleotide totals sum the op lengths, and each ``G`` op adds one
            amino acid.
        """
        events = {"deletion": 0, "match": 0}
        deletion_nucleotides = 0
        match_nucleotides = 0
        match_amino_acids = 0
        for match in self._CIGAR_FRAMESHIFT_RE.finditer(cigar):
            num = int(match.group(1))
            if match.group(2) in "Ff":
                events["deletion"] += 1
                deletion_nucleotides += num
            else:  # the pattern only admits F/f/G/g, so this is a G op
                events["match"] += 1
                match_nucleotides += num
                match_amino_acids += 1
        return events, deletion_nucleotides, match_nucleotides, match_amino_acids

    def parse_cs_tag(self, cs_tag):
        """Summarise the operations of a ``cs`` difference string.

        Args:
            cs_tag: Difference string (the ``cs:Z:`` tag without its prefix).

        Returns:
            An 8-tuple ``(synonymous, non_synonymous, insertions, deletions,
            substitutions, other_events, truncation, inframe_stops)``.
            Insertions and deletions are counted in characters of the
            operation payload, ``other_events`` counts ``~`` operations, and
            ``truncation`` is True when at least one substituted codon
            translates to a stop.
        """
        synonymous = 0
        non_synonymous = 0
        insertion_count = 0
        deletion_count = 0
        substitution_count = 0
        other_event_count = 0
        inframe_stop_count = 0
        for part in self._CS_OP_RE.findall(cs_tag):
            op, payload = part[0], part[1:]
            if op == "*":
                if len(payload) >= 4:  # e.g., '*aaax'
                    nt = payload[:3]
                    aa_ref = payload[3]
                    if self.is_synonymous(nt, aa_ref):
                        synonymous += 1
                    else:
                        non_synonymous += 1
                    substitution_count += 1
                    if self.translate_codon(nt) == "Stop":
                        inframe_stop_count += 1
            elif op == "+":
                insertion_count += len(payload)
            elif op == "-":
                deletion_count += len(payload)
            elif op == "~":
                other_event_count += 1
            # ':' (run of identical residues) contributes nothing.
        truncation = inframe_stop_count > 0
        return (
            synonymous,
            non_synonymous,
            insertion_count,
            deletion_count,
            substitution_count,
            other_event_count,
            truncation,
            inframe_stop_count,
        )

    def is_synonymous(self, nt_seq, aa_ref):
        """Return True when ``nt_seq`` translates to ``aa_ref``.

        A codon that is absent from the table (for example one containing
        ``n``) cannot be translated and is therefore never synonymous.
        """
        aa_residue = self.translation_table.get(nt_seq.lower())
        return aa_residue is not None and aa_residue == aa_ref

    def translate_codon(self, nt_codon):
        """Translate one codon with the active table.

        Returns:
            ``"Stop"`` for a stop codon, ``"X"`` for a codon that is not in
            the table (so ambiguous codons are not mistaken for stops),
            otherwise the one-letter amino acid.
        """
        aa = self.translation_table.get(nt_codon.lower(), "X")
        if aa == "*":
            return "Stop"
        return aa

    def to_dataframe(self):
        """Return the collected records as a pandas DataFrame (one row each)."""
        return pd.DataFrame(self.records)

    def save_stats(self, output_file):
        """Write the collected records to ``output_file`` as tab-separated text."""
        df = self.to_dataframe()
        df.to_csv(output_file, index=False, sep="\t")
def main(input_file, output_file, translation_table):
    """Parse a PAF file and write its per-alignment statistics.

    Args:
        input_file: Path of the PAF file to read.
        output_file: Path the tab-separated statistics are written to.
        translation_table: Genetic-code table id handed to MiniprotStats.
    """
    collector = MiniprotStats(input_file, translation_table)
    collector.parse_paf()
    collector.save_stats(output_file)
    print(f"Statistics saved to {output_file}")
# Command-line entry point: read the options and run the summary once.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Summarize statistics from a PAF file."
    )
    parser.add_argument("-i", "--input", required=True, help="Input PAF file")
    # The statistics are written tab-separated, so the help text says TSV.
    parser.add_argument(
        "-o", "--output", required=True, help="Output TSV file (tab-separated)"
    )
    parser.add_argument(
        "-t",
        "--translation_table",
        type=int,
        default=1,
        help="NCBI translation table ID",
    )
    args = parser.parse_args()
    main(args.input, args.output, args.translation_table)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment