Last active
January 6, 2023 22:10
-
-
Save JoaoRodrigues/e3a4f2139d10888c679eb1657a4d7080 to your computer and use it in GitHub Desktop.
Sequence-based structure alignment of protein structures with Biopython
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""Sequence-based structural alignment of two proteins.""" | |
import argparse | |
import pathlib | |
from Bio.PDB import FastMMCIFParser, MMCIFIO, PDBParser, PDBIO, Superimposer | |
from Bio.PDB.Polypeptide import is_aa | |
from Bio import pairwise2 | |
from Bio.Align import substitution_matrices | |
from Bio.Data.SCOPData import protein_letters_3to1 as aa3to1 | |
def align_sequences(structA, structB): | |
""" | |
Performs a global pairwise alignment between two sequences | |
using the BLOSUM62 matrix and the Needleman-Wunsch algorithm | |
as implemented in Biopython. Returns the alignment, the sequence | |
identity and the residue mapping between both original sequences. | |
""" | |
def _get_pdb_sequence(structure): | |
""" | |
Retrieves the AA sequence from a PDB structure. | |
""" | |
_aainfo = lambda r: (r.id[1], aa3to1.get(r.resname, "X")) | |
seq = [_aainfo(r) for r in structure.get_residues() if is_aa(r)] | |
return seq | |
resseq_A = _get_pdb_sequence(structA) | |
resseq_B = _get_pdb_sequence(structB) | |
sequence_A = "".join([i[1] for i in resseq_A]) | |
sequence_B = "".join([i[1] for i in resseq_B]) | |
alns = pairwise2.align.globalds( | |
sequence_A, | |
sequence_B, | |
substitution_matrices.load("BLOSUM62"), | |
one_alignment_only=True, | |
open=-10.0, | |
extend=-0.5, | |
penalize_end_gaps=(False, False), | |
) | |
best_aln = alns[0] | |
aligned_A, aligned_B, score, begin, end = best_aln | |
# Equivalent residue numbering | |
# Relative to reference | |
mapping = {} | |
aa_i_A, aa_i_B = 0, 0 | |
for aln_i, (aa_aln_A, aa_aln_B) in enumerate(zip(aligned_A, aligned_B)): | |
if aa_aln_A == "-": | |
if aa_aln_B != "-": | |
aa_i_B += 1 | |
elif aa_aln_B == "-": | |
if aa_aln_A != "-": | |
aa_i_A += 1 | |
else: | |
assert resseq_A[aa_i_A][1] == aa_aln_A | |
assert resseq_B[aa_i_B][1] == aa_aln_B | |
mapping[resseq_A[aa_i_A][0]] = resseq_B[aa_i_B][0] | |
aa_i_A += 1 | |
aa_i_B += 1 | |
return mapping | |
def parse_structure(filepath): | |
"""Parse a PDB/cif structure.""" | |
try: | |
filepath.resolve(strict=True) | |
except FileNotFoundError: | |
raise FileNotFoundError(f"File not found: {filepath}") | |
if filepath.suffix in {".pdb", ".ent"}: | |
parser = PDBParser() | |
elif filepath.suffix in {".cif", ".mmcif"}: | |
parser = FastMMCIFParser() | |
else: | |
raise ValueError( | |
f"Unsupported input structure format: {filepath.suffix}" | |
) | |
return parser.get_structure(filepath.name, str(filepath)) | |
if __name__ == "__main__": | |
ap = argparse.ArgumentParser(description=__doc__) | |
ap.add_argument( | |
"refe", | |
type=pathlib.Path, | |
help="Reference Structure", | |
) | |
ap.add_argument( | |
"mobi", | |
type=pathlib.Path, | |
help="Mobile Structure", | |
) | |
ap.add_argument( | |
"--r_chain", | |
default="A", | |
help="Reference Structure Chain", | |
) | |
ap.add_argument( | |
"--m_chain", | |
default="A", | |
help="Mobile Structure Chain", | |
) | |
ap.add_argument( | |
"-o", | |
"--output", | |
type=pathlib.Path, | |
default=None, | |
help="Name of output aligned structure.", | |
) | |
args = ap.parse_args() | |
# Parse structures & take only the necessary chain | |
s_reference = parse_structure(args.refe) | |
try: | |
reference = s_reference[0][args.r_chain] | |
except KeyError: | |
raise Exception(f"Chain {args.r_chain} not found in reference.") | |
s_mobile = parse_structure(args.mobi) | |
try: | |
mobile = s_mobile[0][args.m_chain] | |
except KeyError: | |
raise Exception(f"Chain {args.m_chain} not found in mobile.") | |
# Align sequences to get mapping between residues | |
mapping = align_sequences(reference, mobile) | |
refe_ca_list, mobi_ca_list = [], [] | |
for refe_res in mapping: | |
refe_ca_list.append(reference[refe_res]["CA"]) | |
mobi_ca_list.append(mobile[mapping[refe_res]]["CA"]) | |
# Superimpose matching residues | |
si = Superimposer() | |
si.set_atoms(refe_ca_list, mobi_ca_list) | |
si.apply(mobile.get_atoms()) | |
print(f"RMSD between structures: {si.rms:4.2f}") | |
# Write aligned mobile | |
if args.output is None: | |
ext = args.mobi.suffix | |
args.output = args.mobi.with_suffix(f".aligned{ext}") | |
if args.output.suffix in {".cif", ".mmcif"}: | |
io = MMCIFIO() | |
elif args.output.suffix in {".pdb", ".ent"}: | |
io = PDBIO() | |
else: | |
print("Output format not recognized. Defaulting to mmcIF.") | |
io = MMCIFIO() | |
io.set_structure(mobile) | |
io.save(str(args.output)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you, this is great! I was just about to write this (but googling is faster:)
I think part of the code in main() could be extracted so that there is also an align(mobile, ref) function.
Then this file could be used a python module as well.