Created
September 2, 2022 21:03
-
-
Save edraizen/14733e65d6142126dd21c816e44bd50f to your computer and use it in GitHub Desktop.
Calculated PDB contacts and outputs a similar form to protein contact atlas
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pathlib import Path | |
import click | |
import numpy as np | |
import pandas as pd | |
from biotite.structure.io.pdb import PDBFile | |
from sklearn.metrics import pairwise_distances | |
@click.command() | |
@click.argument('pdb_file', required=1, type=click.Path(exists=True)) | |
@click.option('--chain', default=None, help='Only use specific chain in PDB file else use all') | |
@click.option('--out_file', default=None, help='File to output contact else use stdout') | |
@click.option('--max_cutoff', default=5, type=float, help='Distance cutoff to use') | |
def calculate_contacts(pdb_file, chain=None, out_file=None, max_cutoff=5): | |
pdb_file = Path(pdb_file) | |
pdb_name = pdb_file.stem | |
if out_file is None: | |
out_file = pdb_file.with_suffix(".pdb.contacts") | |
atom_array = PDBFile.read(str(pdb_file)).get_structure(model=1, altloc='first', extra_fields=['atom_id', 'b_factor', 'occupancy']) | |
atom_array = atom_array[atom_array.element!="H"] | |
if chain is not None: | |
atom_array = atom_array[atom_array.chain_id==chain] | |
dist = pairwise_distances(atom_array.coord) | |
dist[np.tril_indices(len(dist), k=0, m=None)] = np.nan | |
dist2 = pd.DataFrame( | |
dist, | |
index=pd.MultiIndex.from_tuples( | |
zip(atom_array.atom_id, atom_array.chain_id, atom_array.res_name, atom_array.res_id, atom_array.ins_code, atom_array.atom_name), | |
name=["atom2_index", "atom2_chain_id", "atom2_res_name", "atom2_res_id", "atom2_ins_code", "atom2_atom_name"]), | |
columns=pd.MultiIndex.from_tuples( | |
zip(atom_array.atom_id, atom_array.chain_id, atom_array.res_name, atom_array.res_id, atom_array.ins_code, atom_array.atom_name), | |
name=["atom1_index", "atom1_chain_id", "atom1_res_name", "atom1_res_id", "atom1_ins_code", "atom1_atom_name"]), | |
) | |
dist3 = dist2.reset_index().melt( | |
id_vars=["atom2_index", "atom2_chain_id", "atom2_res_name", "atom2_res_id", "atom2_ins_code", "atom2_atom_name"], | |
value_name="distance").dropna() | |
dist3 = dist3[["atom1_index", "atom1_chain_id", "atom1_res_name", "atom1_res_id", "atom1_ins_code", "atom1_atom_name", | |
"atom2_index", "atom2_chain_id", "atom2_res_name", "atom2_res_id", "atom2_ins_code", "atom2_atom_name", "distance"]] | |
dist3 = dist3.sort_values(["atom1_chain_id", "atom1_res_id", "atom1_ins_code", "atom2_chain_id", "atom2_res_id", "atom2_ins_code"]) | |
close = dist3[ | |
(dist3.distance<max_cutoff)& | |
(dist3.distance>0)& | |
(dist3.atom1_res_id.astype(str)+dist3.atom1_ins_code!=dist3.atom2_res_id.astype(str)+dist3.atom2_ins_code)] | |
with out_file.open("w") as f: | |
print("PDB", "Chain1", "Chain2", "Res1", "ResNum1", "Res2", "ResNum2", "Number of atomic contacts", "Atoms", "Chain Types", "Distance", sep="\t", file=f) | |
for res1, group in close.groupby(["atom1_chain_id", "atom1_res_id", "atom1_ins_code", "atom2_chain_id", "atom2_res_id", "atom2_ins_code"]): | |
for row in group.itertuples(): | |
atom1_type = "M" if row.atom1_atom_name in ["C", "N", "O", "CA"] else "S" | |
atom2_type = "M" if row.atom2_atom_name in ["C", "N", "O", "CA"] else "S" | |
print( | |
pdb_name, | |
row.atom1_chain_id, | |
row.atom2_chain_id, | |
row.atom1_res_name, | |
f"{row.atom1_res_id}{row.atom1_ins_code}", | |
row.atom2_res_name, | |
f"{row.atom2_res_id}{row.atom2_ins_code}", | |
len(group), | |
row.atom1_atom_name+"-"+row.atom2_atom_name, | |
f"{atom1_type}-{atom2_type}", | |
f"{row.distance:0.4f}", sep="\t", file=f) | |
if __name__ == "__main__": | |
calculate_contacts() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
click | |
numpy | |
pandas | |
biotite | |
sklearn |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment