Skip to content

Instantly share code, notes, and snippets.

@sczizzo
Created March 13, 2013 23:07
Show Gist options
  • Save sczizzo/5157303 to your computer and use it in GitHub Desktop.
Save sczizzo/5157303 to your computer and use it in GitHub Desktop.
Ugly PDB processing utilities for Python
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
from __future__ import division
import numpy as np
import math
def flatten(l):
out = []
for item in l:
if isinstance(item, (list, tuple)):
out.extend(flatten(item))
else:
out.append(item)
return out
##############################################################################
############################# MAJOR DEFS ###################################
##############################################################################
# Return the position vector for a given atom
def atom_pos(atom):
return np.array([atom['x'], atom['y'], atom['z']])
# Find the straight-line distance between any two atoms
def atom_distance(a, b):
return np.linalg.norm(atom_pos(a) - atom_pos(b))
# Find the angle between any three atoms (in radians)
# The span from a1 to a2 should be the hypotenuse
def atom_angle(a1, a2, a3):
a = atom_distance(a2, a3)
b = atom_distance(a1, a3)
c = atom_distance(a1, a2)
return math.acos((a**2 + b**2 - c**2) / (2*a*b))
# Find all atoms with a given name in a residue
def find_atoms(name, residue):
return [a for a in residue['atoms'] if a['atomName'] == name]
# Find the alpha carbon in a residue
def alpha_carbon(residue):
cas = find_atoms("CA", residue)
if len(cas) > 0: return cas[0]
# Determine whether the given atoms consitute a H-bond
# A the acceptor, B the antecdent, H the hydrogen, and D the donor
def is_hbond(a, b, h, d):
half_pi = math.pi / 2
try:
if atom_distance(d, a) >= 3.5: return False
if atom_distance(h, a) >= 2.5: return False
if atom_angle(d, a, h) <= half_pi: return False
if atom_angle(d, b, h) <= half_pi: return False
if atom_angle(h, b, a) <= half_pi: return False
except: return False
return True
# Find the straight-line distance between two atoms in a residue
def residue_distance(residue, a1_name, a2_name):
a1s = find_atoms(a1_name, residue)
a2s = find_atoms(a2_name, residue)
if len(a1s) < 1 \
or len(a2s) < 1: return
a1, a2 = a1s[0], a2s[0]
return atom_distance(a1, a2)
# Calculate the volume of a tetrahedron defined by the atoms in the residue
def residue_volume(residue, a1_name, a2_name, a3_name, a4_name):
a1s = find_atoms(a1_name, residue)
a2s = find_atoms(a2_name, residue)
a3s = find_atoms(a3_name, residue)
a4s = find_atoms(a4_name, residue)
if len(a1s) != 1 \
or len(a2s) != 1 \
or len(a3s) != 1 \
or len(a4s) != 1: return
a = atom_pos(a1s[0])
b = atom_pos(a2s[0])
c = atom_pos(a3s[0])
d = atom_pos(a4s[0])
t1 = a - d
t2a = b - d
t2b = c - d
t2 = np.cross(t2a, t2b)
t = np.cross(t1, t2)
return np.linalg.norm(t) / 6
# Extract raw records from a PDB file
def extract_records(pdb_fname):
pdb_file = open(pdb_fname, "r")
records = pdb_file.readlines()
pdb_file.close()
return records
# Extract all residues from a PDB file
def extract_residues(pdb_fname):
return build_residues(extract_records(pdb_fname))
# Extract all models from a PDB file
def extract_models(pdb_fname):
rs = extract_residues(pdb_fname)
mns = uniq(map(lambda r: r['model'], rs))
models = []
for mn in mns:
models.append({
'model': mn,
'residues': filter(lambda r: r['model'] == mn, rs)
})
return models
##############################################################################
############################ UTILITY DEFS ##################################
##############################################################################
# Uniquify a list
def uniq(ls):
uls = []
for l in ls:
if l not in uls: uls.append(l)
return uls
# Decide whether a raw PDB record is a heteroatom
def is_heteroatom(record):
return len(record) >= 14 and record[0:6] == 'HETATM'
# Decide whether a raw PDB record is an atom
def is_atom(record):
return len(record) >= 14 and record[0:4] == 'ATOM'
# Build an atom from a raw PDB record
def build_atom(record):
if not is_atom(record): return
return {
'recordType': record[ 0:6].strip(),
'atomSerial': int(record[ 6:11].strip()),
'atomName': record[12:16].strip(),
'altLoc': record[16:17].strip(),
'resName': record[17:20].strip(),
'chainID': record[21:22].strip(),
'resSeq': int(record[22:26].strip()),
'iCode': record[26:27].strip(),
'x': float(record[30:38].strip()),
'y': float(record[38:46].strip()),
'z': float(record[46:54].strip()),
'occupancy': float(record[54:60].strip()),
'tempFactor': float(record[60:66].strip()),
'element': record[76:78].strip(),
'charge': record[78:80].strip()
}
# Decide whether a raw PDB record is a model
def is_model(record):
return len(record) >= 14 and record[0:5] == 'MODEL'
# Extract the full list of atoms from a PDB file
def build_atoms(records):
model, atoms = 0, []
for record in records:
if is_model(record):
model = int(record[10:14].strip())
elif is_atom(record):
atom = build_atom(record)
atom['model'] = model
atoms.append(atom)
atoms.sort(key=lambda a: a['atomSerial'])
return atoms
# Decide whether a raw PDB record is a helix
def is_helix(record):
return len(record) >= 8 and record[0:5] == 'HELIX'
# Build a helix from a raw PDB record
def build_helix(record):
if not is_helix(record): return
return {
"recordType": record[ 0:5].strip(),
"helixNumbr": int(record[ 6:10].strip()),
"helixName": record[11:14].strip(),
"altLocBeg": record[ 14].strip(),
"resNameBeg": record[15:18].strip(),
"chainIDBeg": record[ 19].strip(),
"resSeqBeg": int(record[20:25].strip()),
"iCodeBeg": record[ 25].strip(),
"altLocEnd": record[ 26].strip(),
"resNameEnd": record[27:30].strip(),
"chainIDEnd": record[ 31].strip(),
"resSeqEnd": int(record[32:37].strip()),
"iCodeEnd": record[ 37].strip(),
"SheetType": record[38:40].strip(),
}
# Extract the full list of helices from a PDB file
def build_helices(records):
helices = []
for record in records:
helix = build_helix(record)
if helix: helices.append(helix)
return uniq(helices)
# Decide whether a raw PDB record is a sheet
def is_sheet(record):
return len(record) >= 8 and record[0:5] == 'SHEET'
# Build a sheet from a raw PDB record
def build_sheet(record):
if not is_sheet(record): return
return {
'recordType': record[ 0:5].strip(),
'sheetNumbr': int(record[ 6:10].strip()),
'sheetName': record[11:16].strip(),
'altLocBeg': record[ 16].strip(),
'resNameBeg': record[17:20].strip(),
'chainIDBeg': record[ 21].strip(),
'resSeqBeg': int(record[22:26].strip()),
'iCodeBeg': record[ 26].strip(),
'altLocEnd': record[ 27].strip(),
'resNameEnd': record[28:31].strip(),
'chainIDEnd': record[ 32].strip(),
'resSeqEnd': int(record[33:37].strip()),
'iCodeEnd': record[ 37].strip(),
'SheetType': record[38:40].strip(),
}
# Extract the full list of sheets from a PDB file
def build_sheets(records):
sheets = []
for record in records:
sheet = build_sheet(record)
if sheet: sheets.append(sheet)
return uniq(sheets)
# Calculate the multiplier for an iCode
def icode_mult(icode):
if icode == "": return 0
if icode == "A": return 1
if icode == "B": return 2
if icode == "C": return 3
if icode == "D": return 4
if icode == "E": return 5
if icode == "F": return 6
if icode == "G": return 7
if icode == "H": return 8
if icode == "I": return 9
if icode == "J": return 10
if icode == "X": return 11
return 12
# Calculate the residue number given a sequence number and icode
def res_num(res_seq, icode):
return res_seq + 0.1 * icode_mult(icode)
# Decide whether a record is a part of a helix
def in_helix(num, chain_id, helices):
for ss in helices:
ss_num_beg = res_num(ss['resSeqBeg'], ss['iCodeBeg'])
ss_num_end = res_num(ss['resSeqEnd'], ss['iCodeEnd'])
if chain_id == ss['chainIDBeg'] \
and num >= ss_num_beg \
and num <= ss_num_end: return True
return False
# Decide whether a record is part of a sheet
def in_sheet(num, chain_id, sheets):
for ss in sheets:
ss_num_beg = res_num(ss["resSeqBeg"], ss["iCodeBeg"])
ss_num_end = res_num(ss["resSeqEnd"], ss["iCodeEnd"])
if chain_id == ss["chainIDBeg"] \
and num >= ss_num_beg \
and num <= ss_num_end: return True
return False
# Return the secondary structure code for a residue
def ss_code(num, chain_id, helices, sheets):
in_a_helix = in_helix(num, chain_id, helices)
in_a_sheet = in_sheet(num, chain_id, sheets)
if in_a_helix and in_a_sheet: return 'unknown'
if in_a_helix: return 'helix'
if in_a_sheet: return 'sheet'
return 'loop'
# Build a residue from raw PDB information
def build_residue(atoms, helices, sheets):
model = atoms[0]['model']
resName = atoms[0]['resName']
chainID = atoms[0]['chainID']
resSeq = atoms[0]['resSeq']
iCode = atoms[0]['iCode']
resNum = res_num(resSeq, iCode)
ssCode = ss_code(resNum, chainID, helices, sheets)
return {
'model': model,
'resNum': resNum,
'resName': resName,
'chainID': chainID,
'resSeq': resSeq,
'iCode': iCode,
'ssCode': ssCode,
'atoms': atoms
}
# Decide whether an atom represents an alternate location
def is_alternate(atom):
return '' != atom['altLoc'] and 'A' != atom['altLoc']
# Extract the full list of residues from a PDB file
# N.B. Discards alternate locations
def build_residues(records):
helices = build_helices(records)
sheets = build_sheets(records)
residues = []
last_resSeq = None
last_atoms = None
for atom in build_atoms(records):
if last_resSeq == atom['resSeq']:
if not is_alternate(atom):
last_atoms.append(atom)
else:
if last_atoms != None:
residue = build_residue(last_atoms, helices, sheets)
residues.append(residue)
last_atoms = [atom]
last_resSeq = atom['resSeq']
residues.sort(key=lambda r: r['resNum'])
return residues
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment