Created
March 13, 2013 23:07
-
-
Save sczizzo/5157303 to your computer and use it in GitHub Desktop.
Ugly PDB processing utilities for Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: UTF-8 -*- | |
from __future__ import division | |
import numpy as np | |
import math | |
def flatten(l): | |
out = [] | |
for item in l: | |
if isinstance(item, (list, tuple)): | |
out.extend(flatten(item)) | |
else: | |
out.append(item) | |
return out | |
############################################################################## | |
############################# MAJOR DEFS ################################### | |
############################################################################## | |
# Return the position vector for a given atom | |
def atom_pos(atom): | |
return np.array([atom['x'], atom['y'], atom['z']]) | |
# Find the straight-line distance between any two atoms | |
def atom_distance(a, b): | |
return np.linalg.norm(atom_pos(a) - atom_pos(b)) | |
# Find the angle between any three atoms (in radians) | |
# The span from a1 to a2 should be the hypotenuse | |
def atom_angle(a1, a2, a3): | |
a = atom_distance(a2, a3) | |
b = atom_distance(a1, a3) | |
c = atom_distance(a1, a2) | |
return math.acos((a**2 + b**2 - c**2) / (2*a*b)) | |
# Find all atoms with a given name in a residue | |
def find_atoms(name, residue): | |
return [a for a in residue['atoms'] if a['atomName'] == name] | |
# Find the alpha carbon in a residue | |
def alpha_carbon(residue): | |
cas = find_atoms("CA", residue) | |
if len(cas) > 0: return cas[0] | |
# Determine whether the given atoms consitute a H-bond | |
# A the acceptor, B the antecdent, H the hydrogen, and D the donor | |
def is_hbond(a, b, h, d): | |
half_pi = math.pi / 2 | |
try: | |
if atom_distance(d, a) >= 3.5: return False | |
if atom_distance(h, a) >= 2.5: return False | |
if atom_angle(d, a, h) <= half_pi: return False | |
if atom_angle(d, b, h) <= half_pi: return False | |
if atom_angle(h, b, a) <= half_pi: return False | |
except: return False | |
return True | |
# Find the straight-line distance between two atoms in a residue | |
def residue_distance(residue, a1_name, a2_name): | |
a1s = find_atoms(a1_name, residue) | |
a2s = find_atoms(a2_name, residue) | |
if len(a1s) < 1 \ | |
or len(a2s) < 1: return | |
a1, a2 = a1s[0], a2s[0] | |
return atom_distance(a1, a2) | |
# Calculate the volume of a tetrahedron defined by the atoms in the residue | |
def residue_volume(residue, a1_name, a2_name, a3_name, a4_name): | |
a1s = find_atoms(a1_name, residue) | |
a2s = find_atoms(a2_name, residue) | |
a3s = find_atoms(a3_name, residue) | |
a4s = find_atoms(a4_name, residue) | |
if len(a1s) != 1 \ | |
or len(a2s) != 1 \ | |
or len(a3s) != 1 \ | |
or len(a4s) != 1: return | |
a = atom_pos(a1s[0]) | |
b = atom_pos(a2s[0]) | |
c = atom_pos(a3s[0]) | |
d = atom_pos(a4s[0]) | |
t1 = a - d | |
t2a = b - d | |
t2b = c - d | |
t2 = np.cross(t2a, t2b) | |
t = np.cross(t1, t2) | |
return np.linalg.norm(t) / 6 | |
# Extract raw records from a PDB file | |
def extract_records(pdb_fname): | |
pdb_file = open(pdb_fname, "r") | |
records = pdb_file.readlines() | |
pdb_file.close() | |
return records | |
# Extract all residues from a PDB file | |
def extract_residues(pdb_fname): | |
return build_residues(extract_records(pdb_fname)) | |
# Extract all models from a PDB file | |
def extract_models(pdb_fname): | |
rs = extract_residues(pdb_fname) | |
mns = uniq(map(lambda r: r['model'], rs)) | |
models = [] | |
for mn in mns: | |
models.append({ | |
'model': mn, | |
'residues': filter(lambda r: r['model'] == mn, rs) | |
}) | |
return models | |
############################################################################## | |
############################ UTILITY DEFS ################################## | |
############################################################################## | |
# Uniquify a list | |
def uniq(ls): | |
uls = [] | |
for l in ls: | |
if l not in uls: uls.append(l) | |
return uls | |
# Decide whether a raw PDB record is a heteroatom | |
def is_heteroatom(record): | |
return len(record) >= 14 and record[0:6] == 'HETATM' | |
# Decide whether a raw PDB record is an atom | |
def is_atom(record): | |
return len(record) >= 14 and record[0:4] == 'ATOM' | |
# Build an atom from a raw PDB record | |
def build_atom(record): | |
if not is_atom(record): return | |
return { | |
'recordType': record[ 0:6].strip(), | |
'atomSerial': int(record[ 6:11].strip()), | |
'atomName': record[12:16].strip(), | |
'altLoc': record[16:17].strip(), | |
'resName': record[17:20].strip(), | |
'chainID': record[21:22].strip(), | |
'resSeq': int(record[22:26].strip()), | |
'iCode': record[26:27].strip(), | |
'x': float(record[30:38].strip()), | |
'y': float(record[38:46].strip()), | |
'z': float(record[46:54].strip()), | |
'occupancy': float(record[54:60].strip()), | |
'tempFactor': float(record[60:66].strip()), | |
'element': record[76:78].strip(), | |
'charge': record[78:80].strip() | |
} | |
# Decide whether a raw PDB record is a model | |
def is_model(record): | |
return len(record) >= 14 and record[0:5] == 'MODEL' | |
# Extract the full list of atoms from a PDB file | |
def build_atoms(records): | |
model, atoms = 0, [] | |
for record in records: | |
if is_model(record): | |
model = int(record[10:14].strip()) | |
elif is_atom(record): | |
atom = build_atom(record) | |
atom['model'] = model | |
atoms.append(atom) | |
atoms.sort(key=lambda a: a['atomSerial']) | |
return atoms | |
# Decide whether a raw PDB record is a helix | |
def is_helix(record): | |
return len(record) >= 8 and record[0:5] == 'HELIX' | |
# Build a helix from a raw PDB record | |
def build_helix(record): | |
if not is_helix(record): return | |
return { | |
"recordType": record[ 0:5].strip(), | |
"helixNumbr": int(record[ 6:10].strip()), | |
"helixName": record[11:14].strip(), | |
"altLocBeg": record[ 14].strip(), | |
"resNameBeg": record[15:18].strip(), | |
"chainIDBeg": record[ 19].strip(), | |
"resSeqBeg": int(record[20:25].strip()), | |
"iCodeBeg": record[ 25].strip(), | |
"altLocEnd": record[ 26].strip(), | |
"resNameEnd": record[27:30].strip(), | |
"chainIDEnd": record[ 31].strip(), | |
"resSeqEnd": int(record[32:37].strip()), | |
"iCodeEnd": record[ 37].strip(), | |
"SheetType": record[38:40].strip(), | |
} | |
# Extract the full list of helices from a PDB file | |
def build_helices(records): | |
helices = [] | |
for record in records: | |
helix = build_helix(record) | |
if helix: helices.append(helix) | |
return uniq(helices) | |
# Decide whether a raw PDB record is a sheet | |
def is_sheet(record): | |
return len(record) >= 8 and record[0:5] == 'SHEET' | |
# Build a sheet from a raw PDB record | |
def build_sheet(record): | |
if not is_sheet(record): return | |
return { | |
'recordType': record[ 0:5].strip(), | |
'sheetNumbr': int(record[ 6:10].strip()), | |
'sheetName': record[11:16].strip(), | |
'altLocBeg': record[ 16].strip(), | |
'resNameBeg': record[17:20].strip(), | |
'chainIDBeg': record[ 21].strip(), | |
'resSeqBeg': int(record[22:26].strip()), | |
'iCodeBeg': record[ 26].strip(), | |
'altLocEnd': record[ 27].strip(), | |
'resNameEnd': record[28:31].strip(), | |
'chainIDEnd': record[ 32].strip(), | |
'resSeqEnd': int(record[33:37].strip()), | |
'iCodeEnd': record[ 37].strip(), | |
'SheetType': record[38:40].strip(), | |
} | |
# Extract the full list of sheets from a PDB file | |
def build_sheets(records): | |
sheets = [] | |
for record in records: | |
sheet = build_sheet(record) | |
if sheet: sheets.append(sheet) | |
return uniq(sheets) | |
# Calculate the multiplier for an iCode | |
def icode_mult(icode): | |
if icode == "": return 0 | |
if icode == "A": return 1 | |
if icode == "B": return 2 | |
if icode == "C": return 3 | |
if icode == "D": return 4 | |
if icode == "E": return 5 | |
if icode == "F": return 6 | |
if icode == "G": return 7 | |
if icode == "H": return 8 | |
if icode == "I": return 9 | |
if icode == "J": return 10 | |
if icode == "X": return 11 | |
return 12 | |
# Calculate the residue number given a sequence number and icode | |
def res_num(res_seq, icode): | |
return res_seq + 0.1 * icode_mult(icode) | |
# Decide whether a record is a part of a helix | |
def in_helix(num, chain_id, helices): | |
for ss in helices: | |
ss_num_beg = res_num(ss['resSeqBeg'], ss['iCodeBeg']) | |
ss_num_end = res_num(ss['resSeqEnd'], ss['iCodeEnd']) | |
if chain_id == ss['chainIDBeg'] \ | |
and num >= ss_num_beg \ | |
and num <= ss_num_end: return True | |
return False | |
# Decide whether a record is part of a sheet | |
def in_sheet(num, chain_id, sheets): | |
for ss in sheets: | |
ss_num_beg = res_num(ss["resSeqBeg"], ss["iCodeBeg"]) | |
ss_num_end = res_num(ss["resSeqEnd"], ss["iCodeEnd"]) | |
if chain_id == ss["chainIDBeg"] \ | |
and num >= ss_num_beg \ | |
and num <= ss_num_end: return True | |
return False | |
# Return the secondary structure code for a residue | |
def ss_code(num, chain_id, helices, sheets): | |
in_a_helix = in_helix(num, chain_id, helices) | |
in_a_sheet = in_sheet(num, chain_id, sheets) | |
if in_a_helix and in_a_sheet: return 'unknown' | |
if in_a_helix: return 'helix' | |
if in_a_sheet: return 'sheet' | |
return 'loop' | |
# Build a residue from raw PDB information | |
def build_residue(atoms, helices, sheets): | |
model = atoms[0]['model'] | |
resName = atoms[0]['resName'] | |
chainID = atoms[0]['chainID'] | |
resSeq = atoms[0]['resSeq'] | |
iCode = atoms[0]['iCode'] | |
resNum = res_num(resSeq, iCode) | |
ssCode = ss_code(resNum, chainID, helices, sheets) | |
return { | |
'model': model, | |
'resNum': resNum, | |
'resName': resName, | |
'chainID': chainID, | |
'resSeq': resSeq, | |
'iCode': iCode, | |
'ssCode': ssCode, | |
'atoms': atoms | |
} | |
# Decide whether an atom represents an alternate location | |
def is_alternate(atom): | |
return '' != atom['altLoc'] and 'A' != atom['altLoc'] | |
# Extract the full list of residues from a PDB file | |
# N.B. Discards alternate locations | |
def build_residues(records): | |
helices = build_helices(records) | |
sheets = build_sheets(records) | |
residues = [] | |
last_resSeq = None | |
last_atoms = None | |
for atom in build_atoms(records): | |
if last_resSeq == atom['resSeq']: | |
if not is_alternate(atom): | |
last_atoms.append(atom) | |
else: | |
if last_atoms != None: | |
residue = build_residue(last_atoms, helices, sheets) | |
residues.append(residue) | |
last_atoms = [atom] | |
last_resSeq = atom['resSeq'] | |
residues.sort(key=lambda r: r['resNum']) | |
return residues |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment