sczizzo · March 13, 2013 23:07
diff --git a/pdb.py b/pdb.py
 #!/usr/bin/env python
 # -*- coding: UTF-8 -*-
 from __future__ import division
 import numpy as np
 import math

 def flatten(l):
  out = []
  for item in l:
    if isinstance(item, (list, tuple)):
      out.extend(flatten(item))
    else:
      out.append(item)
  return out


 ##############################################################################
 #############################  MAJOR DEFS  ###################################
 ##############################################################################

 # Return the position vector for a given atom
 def atom_pos(atom):
  return np.array([atom['x'], atom['y'], atom['z']])

 # Find the straight-line distance between any two atoms
 def atom_distance(a, b):
  return np.linalg.norm(atom_pos(a) - atom_pos(b))

 # Find the angle between any three atoms (in radians)
 # The span from a1 to a2 should be the hypotenuse
 def atom_angle(a1, a2, a3):
  a = atom_distance(a2, a3)
  b = atom_distance(a1, a3)
  c = atom_distance(a1, a2)
  return math.acos((a**2 + b**2 - c**2) / (2*a*b))

 # Find all atoms with a given name in a residue
 def find_atoms(name, residue):
  return [a for a in residue['atoms'] if a['atomName'] == name]

 # Find the alpha carbon in a residue
 def alpha_carbon(residue):
  cas = find_atoms("CA", residue)
  if len(cas) > 0: return cas[0]

 # Determine whether the given atoms consitute a H-bond
 # A the acceptor, B the antecdent, H the hydrogen, and D the donor
 def is_hbond(a, b, h, d):
  half_pi = math.pi / 2
  try:
    if atom_distance(d, a) >= 3.5: return False
    if atom_distance(h, a) >= 2.5: return False
    if atom_angle(d, a, h) <= half_pi: return False
    if atom_angle(d, b, h) <= half_pi: return False
    if atom_angle(h, b, a) <= half_pi: return False
  except: return False
  return True

 # Find the straight-line distance between two atoms in a residue
 def residue_distance(residue, a1_name, a2_name):
  a1s = find_atoms(a1_name, residue)
  a2s = find_atoms(a2_name, residue)
  if len(a1s) < 1 \
  or len(a2s) < 1: return
  a1, a2 = a1s[0], a2s[0]
  return atom_distance(a1, a2)

 # Calculate the volume of a tetrahedron defined by the atoms in the residue
 def residue_volume(residue, a1_name, a2_name, a3_name, a4_name):
  a1s = find_atoms(a1_name, residue)
  a2s = find_atoms(a2_name, residue)
  a3s = find_atoms(a3_name, residue)
  a4s = find_atoms(a4_name, residue)
  if len(a1s) != 1 \
  or len(a2s) != 1 \
  or len(a3s) != 1 \
  or len(a4s) != 1: return
  a   = atom_pos(a1s[0])
  b   = atom_pos(a2s[0])
  c   = atom_pos(a3s[0])
  d   = atom_pos(a4s[0])
  t1  = a - d
  t2a = b - d
  t2b = c - d
  t2  = np.cross(t2a, t2b)
  t   = np.cross(t1, t2)
  return np.linalg.norm(t) / 6

 # Extract raw records from a PDB file
 def extract_records(pdb_fname):
  pdb_file = open(pdb_fname, "r")
  records  = pdb_file.readlines()
  pdb_file.close()
  return records

 # Extract all residues from a PDB file
 def extract_residues(pdb_fname):
  return build_residues(extract_records(pdb_fname))

 # Extract all models from a PDB file
 def extract_models(pdb_fname):
  rs     = extract_residues(pdb_fname)
  mns    = uniq(map(lambda r: r['model'], rs))
  models = []
  for mn in mns:
    models.append({
      'model': mn,
      'residues': filter(lambda r: r['model'] == mn, rs)
    })
  return models



 ##############################################################################
 ############################  UTILITY DEFS  ##################################
 ##############################################################################

 # Uniquify a list
 def uniq(ls):
  uls = []
  for l in ls:
    if l not in uls: uls.append(l)
  return uls

 # Decide whether a raw PDB record is a heteroatom
 def is_heteroatom(record):
  return len(record) >= 14 and record[0:6] == 'HETATM'

 # Decide whether a raw PDB record is an atom
 def is_atom(record):
  return len(record) >= 14 and record[0:4] == 'ATOM'

 # Build an atom from a raw PDB record
 def build_atom(record):
  if not is_atom(record): return
  return {
    'recordType':       record[  0:6].strip(),
    'atomSerial':   int(record[ 6:11].strip()),
    'atomName':         record[12:16].strip(),
    'altLoc':           record[16:17].strip(),
    'resName':          record[17:20].strip(),
    'chainID':          record[21:22].strip(),
    'resSeq':       int(record[22:26].strip()),
    'iCode':            record[26:27].strip(),
    'x':          float(record[30:38].strip()),
    'y':          float(record[38:46].strip()),
    'z':          float(record[46:54].strip()),
    'occupancy':  float(record[54:60].strip()),
    'tempFactor': float(record[60:66].strip()),
    'element':          record[76:78].strip(),
    'charge':           record[78:80].strip()
  }

 # Decide whether a raw PDB record is a model
 def is_model(record):
  return len(record) >= 14 and record[0:5] == 'MODEL'

 # Extract the full list of atoms from a PDB file
 def build_atoms(records):
  model, atoms = 0, []
  for record in records:
    if is_model(record):
      model = int(record[10:14].strip())
    elif is_atom(record):
      atom = build_atom(record)
      atom['model'] = model
      atoms.append(atom)
  atoms.sort(key=lambda a: a['atomSerial'])
  return atoms

 # Decide whether a raw PDB record is a helix
 def is_helix(record):
  return len(record) >= 8 and record[0:5] == 'HELIX'

 # Build a helix from a raw PDB record
 def build_helix(record):
  if not is_helix(record): return
  return {
    "recordType":     record[  0:5].strip(),
    "helixNumbr": int(record[ 6:10].strip()),
    "helixName":      record[11:14].strip(),
    "altLocBeg":      record[   14].strip(),
    "resNameBeg":     record[15:18].strip(),
    "chainIDBeg":     record[   19].strip(),
    "resSeqBeg":  int(record[20:25].strip()),
    "iCodeBeg":       record[   25].strip(),
    "altLocEnd":      record[   26].strip(),
    "resNameEnd":     record[27:30].strip(),
    "chainIDEnd":     record[   31].strip(),
    "resSeqEnd":  int(record[32:37].strip()),
    "iCodeEnd":       record[   37].strip(),
    "SheetType":      record[38:40].strip(),
  }

 # Extract the full list of helices from a PDB file
 def build_helices(records):
  helices = []
  for record in records:
    helix = build_helix(record)
    if helix: helices.append(helix)
  return uniq(helices)

 # Decide whether a raw PDB record is a sheet
 def is_sheet(record):
  return len(record) >= 8 and record[0:5] == 'SHEET'

 # Build a sheet from a raw PDB record
 def build_sheet(record):
  if not is_sheet(record): return
  return {
    'recordType':     record[  0:5].strip(),
    'sheetNumbr': int(record[ 6:10].strip()),
    'sheetName':      record[11:16].strip(),
    'altLocBeg':      record[   16].strip(),
    'resNameBeg':     record[17:20].strip(),
    'chainIDBeg':     record[   21].strip(),
    'resSeqBeg':  int(record[22:26].strip()),
    'iCodeBeg':       record[   26].strip(),
    'altLocEnd':      record[   27].strip(),
    'resNameEnd':     record[28:31].strip(),
    'chainIDEnd':     record[   32].strip(),
    'resSeqEnd':  int(record[33:37].strip()),
    'iCodeEnd':       record[   37].strip(),
    'SheetType':      record[38:40].strip(),
  }

 # Extract the full list of sheets from a PDB file
 def build_sheets(records):
  sheets = []
  for record in records:
    sheet = build_sheet(record)
    if sheet: sheets.append(sheet)
  return uniq(sheets)

 # Calculate the multiplier for an iCode
 def icode_mult(icode):
  if icode ==  "": return 0
  if icode == "A": return 1
  if icode == "B": return 2
  if icode == "C": return 3
  if icode == "D": return 4
  if icode == "E": return 5
  if icode == "F": return 6
  if icode == "G": return 7
  if icode == "H": return 8
  if icode == "I": return 9
  if icode == "J": return 10
  if icode == "X": return 11
  return 12

 # Calculate the residue number given a sequence number and icode
 def res_num(res_seq, icode):
  return res_seq + 0.1 * icode_mult(icode)

 # Decide whether a record is a part of a helix
 def in_helix(num, chain_id, helices):
  for ss in helices:
    ss_num_beg = res_num(ss['resSeqBeg'], ss['iCodeBeg'])
    ss_num_end = res_num(ss['resSeqEnd'], ss['iCodeEnd'])
    if chain_id == ss['chainIDBeg'] \
    and num >= ss_num_beg \
    and num <= ss_num_end: return True
  return False

 # Decide whether a record is part of a sheet
 def in_sheet(num, chain_id, sheets):
  for ss in sheets:
    ss_num_beg = res_num(ss["resSeqBeg"], ss["iCodeBeg"])
    ss_num_end = res_num(ss["resSeqEnd"], ss["iCodeEnd"])
    if chain_id == ss["chainIDBeg"] \
    and num >= ss_num_beg \
    and num <= ss_num_end: return True
  return False

 # Return the secondary structure code for a residue
 def ss_code(num, chain_id, helices, sheets):
  in_a_helix = in_helix(num, chain_id, helices)
  in_a_sheet = in_sheet(num, chain_id, sheets)
  if in_a_helix and in_a_sheet: return 'unknown'
  if in_a_helix: return 'helix'
  if in_a_sheet: return 'sheet'
  return 'loop'

 # Build a residue from raw PDB information
 def build_residue(atoms, helices, sheets):
  model   = atoms[0]['model']
  resName = atoms[0]['resName']
  chainID = atoms[0]['chainID']
  resSeq  = atoms[0]['resSeq']
  iCode   = atoms[0]['iCode']
  resNum  = res_num(resSeq, iCode)
  ssCode  = ss_code(resNum, chainID, helices, sheets)
  return {
    'model':   model,
    'resNum':  resNum,
    'resName': resName,
    'chainID': chainID,
    'resSeq':  resSeq,
    'iCode':   iCode,
    'ssCode':  ssCode,
    'atoms':   atoms
  }

 # Decide whether an atom represents an alternate location
 def is_alternate(atom):
  return '' != atom['altLoc'] and 'A' != atom['altLoc']

 # Extract the full list of residues from a PDB file
 # N.B. Discards alternate locations
 def build_residues(records):
  helices     = build_helices(records)
  sheets      = build_sheets(records)
  residues    = []
  last_resSeq = None
  last_atoms  = None
  for atom in build_atoms(records):
    if last_resSeq == atom['resSeq']:
      if not is_alternate(atom):
        last_atoms.append(atom)
    else:
      if last_atoms != None:
        residue = build_residue(last_atoms, helices, sheets)
        residues.append(residue)
      last_atoms = [atom]
    last_resSeq = atom['resSeq']
  residues.sort(key=lambda r: r['resNum'])
  return residues
	#!/usr/bin/env python
	# -- coding: UTF-8 --
	from __future__ import division
	import numpy as np
	import math

	def flatten(l):
	out = []
	for item in l:
	if isinstance(item, (list, tuple)):
	out.extend(flatten(item))
	else:
	out.append(item)
	return out


	##############################################################################
	############################# MAJOR DEFS ###################################
	##############################################################################

	# Return the position vector for a given atom
	def atom_pos(atom):
	return np.array([atom['x'], atom['y'], atom['z']])

	# Find the straight-line distance between any two atoms
	def atom_distance(a, b):
	return np.linalg.norm(atom_pos(a) - atom_pos(b))

	# Find the angle between any three atoms (in radians)
	# The span from a1 to a2 should be the hypotenuse
	def atom_angle(a1, a2, a3):
	a = atom_distance(a2, a3)
	b = atom_distance(a1, a3)
	c = atom_distance(a1, a2)
	return math.acos((a2 + b2 - c*2) / (2a*b))

	# Find all atoms with a given name in a residue
	def find_atoms(name, residue):
	return [a for a in residue['atoms'] if a['atomName'] == name]

	# Find the alpha carbon in a residue
	def alpha_carbon(residue):
	cas = find_atoms("CA", residue)
	if len(cas) > 0: return cas[0]

	# Determine whether the given atoms consitute a H-bond
	# A the acceptor, B the antecdent, H the hydrogen, and D the donor
	def is_hbond(a, b, h, d):
	half_pi = math.pi / 2
	try:
	if atom_distance(d, a) >= 3.5: return False
	if atom_distance(h, a) >= 2.5: return False
	if atom_angle(d, a, h) <= half_pi: return False
	if atom_angle(d, b, h) <= half_pi: return False
	if atom_angle(h, b, a) <= half_pi: return False
	except: return False
	return True

	# Find the straight-line distance between two atoms in a residue
	def residue_distance(residue, a1_name, a2_name):
	a1s = find_atoms(a1_name, residue)
	a2s = find_atoms(a2_name, residue)
	if len(a1s) < 1 \
	or len(a2s) < 1: return
	a1, a2 = a1s[0], a2s[0]
	return atom_distance(a1, a2)

	# Calculate the volume of a tetrahedron defined by the atoms in the residue
	def residue_volume(residue, a1_name, a2_name, a3_name, a4_name):
	a1s = find_atoms(a1_name, residue)
	a2s = find_atoms(a2_name, residue)
	a3s = find_atoms(a3_name, residue)
	a4s = find_atoms(a4_name, residue)
	if len(a1s) != 1 \
	or len(a2s) != 1 \
	or len(a3s) != 1 \
	or len(a4s) != 1: return
	a = atom_pos(a1s[0])
	b = atom_pos(a2s[0])
	c = atom_pos(a3s[0])
	d = atom_pos(a4s[0])
	t1 = a - d
	t2a = b - d
	t2b = c - d
	t2 = np.cross(t2a, t2b)
	t = np.cross(t1, t2)
	return np.linalg.norm(t) / 6

	# Extract raw records from a PDB file
	def extract_records(pdb_fname):
	pdb_file = open(pdb_fname, "r")
	records = pdb_file.readlines()
	pdb_file.close()
	return records

	# Extract all residues from a PDB file
	def extract_residues(pdb_fname):
	return build_residues(extract_records(pdb_fname))

	# Extract all models from a PDB file
	def extract_models(pdb_fname):
	rs = extract_residues(pdb_fname)
	mns = uniq(map(lambda r: r['model'], rs))
	models = []
	for mn in mns:
	models.append({
	'model': mn,
	'residues': filter(lambda r: r['model'] == mn, rs)
	})
	return models



	##############################################################################
	############################ UTILITY DEFS ##################################
	##############################################################################

	# Uniquify a list
	def uniq(ls):
	uls = []
	for l in ls:
	if l not in uls: uls.append(l)
	return uls

	# Decide whether a raw PDB record is a heteroatom
	def is_heteroatom(record):
	return len(record) >= 14 and record[0:6] == 'HETATM'

	# Decide whether a raw PDB record is an atom
	def is_atom(record):
	return len(record) >= 14 and record[0:4] == 'ATOM'

	# Build an atom from a raw PDB record
	def build_atom(record):
	if not is_atom(record): return
	return {
	'recordType': record[ 0:6].strip(),
	'atomSerial': int(record[ 6:11].strip()),
	'atomName': record[12:16].strip(),
	'altLoc': record[16:17].strip(),
	'resName': record[17:20].strip(),
	'chainID': record[21:22].strip(),
	'resSeq': int(record[22:26].strip()),
	'iCode': record[26:27].strip(),
	'x': float(record[30:38].strip()),
	'y': float(record[38:46].strip()),
	'z': float(record[46:54].strip()),
	'occupancy': float(record[54:60].strip()),
	'tempFactor': float(record[60:66].strip()),
	'element': record[76:78].strip(),
	'charge': record[78:80].strip()
	}

	# Decide whether a raw PDB record is a model
	def is_model(record):
	return len(record) >= 14 and record[0:5] == 'MODEL'

	# Extract the full list of atoms from a PDB file
	def build_atoms(records):
	model, atoms = 0, []
	for record in records:
	if is_model(record):
	model = int(record[10:14].strip())
	elif is_atom(record):
	atom = build_atom(record)
	atom['model'] = model
	atoms.append(atom)
	atoms.sort(key=lambda a: a['atomSerial'])
	return atoms

	# Decide whether a raw PDB record is a helix
	def is_helix(record):
	return len(record) >= 8 and record[0:5] == 'HELIX'

	# Build a helix from a raw PDB record
	def build_helix(record):
	if not is_helix(record): return
	return {
	"recordType": record[ 0:5].strip(),
	"helixNumbr": int(record[ 6:10].strip()),
	"helixName": record[11:14].strip(),
	"altLocBeg": record[ 14].strip(),
	"resNameBeg": record[15:18].strip(),
	"chainIDBeg": record[ 19].strip(),
	"resSeqBeg": int(record[20:25].strip()),
	"iCodeBeg": record[ 25].strip(),
	"altLocEnd": record[ 26].strip(),
	"resNameEnd": record[27:30].strip(),
	"chainIDEnd": record[ 31].strip(),
	"resSeqEnd": int(record[32:37].strip()),
	"iCodeEnd": record[ 37].strip(),
	"SheetType": record[38:40].strip(),
	}

	# Extract the full list of helices from a PDB file
	def build_helices(records):
	helices = []
	for record in records:
	helix = build_helix(record)
	if helix: helices.append(helix)
	return uniq(helices)

	# Decide whether a raw PDB record is a sheet
	def is_sheet(record):
	return len(record) >= 8 and record[0:5] == 'SHEET'

	# Build a sheet from a raw PDB record
	def build_sheet(record):
	if not is_sheet(record): return
	return {
	'recordType': record[ 0:5].strip(),
	'sheetNumbr': int(record[ 6:10].strip()),
	'sheetName': record[11:16].strip(),
	'altLocBeg': record[ 16].strip(),
	'resNameBeg': record[17:20].strip(),
	'chainIDBeg': record[ 21].strip(),
	'resSeqBeg': int(record[22:26].strip()),
	'iCodeBeg': record[ 26].strip(),
	'altLocEnd': record[ 27].strip(),
	'resNameEnd': record[28:31].strip(),
	'chainIDEnd': record[ 32].strip(),
	'resSeqEnd': int(record[33:37].strip()),
	'iCodeEnd': record[ 37].strip(),
	'SheetType': record[38:40].strip(),
	}

	# Extract the full list of sheets from a PDB file
	def build_sheets(records):
	sheets = []
	for record in records:
	sheet = build_sheet(record)
	if sheet: sheets.append(sheet)
	return uniq(sheets)

	# Calculate the multiplier for an iCode
	def icode_mult(icode):
	if icode == "": return 0
	if icode == "A": return 1
	if icode == "B": return 2
	if icode == "C": return 3
	if icode == "D": return 4
	if icode == "E": return 5
	if icode == "F": return 6
	if icode == "G": return 7
	if icode == "H": return 8
	if icode == "I": return 9
	if icode == "J": return 10
	if icode == "X": return 11
	return 12

	# Calculate the residue number given a sequence number and icode
	def res_num(res_seq, icode):
	return res_seq + 0.1 * icode_mult(icode)

	# Decide whether a record is a part of a helix
	def in_helix(num, chain_id, helices):
	for ss in helices:
	ss_num_beg = res_num(ss['resSeqBeg'], ss['iCodeBeg'])
	ss_num_end = res_num(ss['resSeqEnd'], ss['iCodeEnd'])
	if chain_id == ss['chainIDBeg'] \
	and num >= ss_num_beg \
	and num <= ss_num_end: return True
	return False

	# Decide whether a record is part of a sheet
	def in_sheet(num, chain_id, sheets):
	for ss in sheets:
	ss_num_beg = res_num(ss["resSeqBeg"], ss["iCodeBeg"])
	ss_num_end = res_num(ss["resSeqEnd"], ss["iCodeEnd"])
	if chain_id == ss["chainIDBeg"] \
	and num >= ss_num_beg \
	and num <= ss_num_end: return True
	return False

	# Return the secondary structure code for a residue
	def ss_code(num, chain_id, helices, sheets):
	in_a_helix = in_helix(num, chain_id, helices)
	in_a_sheet = in_sheet(num, chain_id, sheets)
	if in_a_helix and in_a_sheet: return 'unknown'
	if in_a_helix: return 'helix'
	if in_a_sheet: return 'sheet'
	return 'loop'

	# Build a residue from raw PDB information
	def build_residue(atoms, helices, sheets):
	model = atoms[0]['model']
	resName = atoms[0]['resName']
	chainID = atoms[0]['chainID']
	resSeq = atoms[0]['resSeq']
	iCode = atoms[0]['iCode']
	resNum = res_num(resSeq, iCode)
	ssCode = ss_code(resNum, chainID, helices, sheets)
	return {
	'model': model,
	'resNum': resNum,
	'resName': resName,
	'chainID': chainID,
	'resSeq': resSeq,
	'iCode': iCode,
	'ssCode': ssCode,
	'atoms': atoms
	}

	# Decide whether an atom represents an alternate location
	def is_alternate(atom):
	return '' != atom['altLoc'] and 'A' != atom['altLoc']

	# Extract the full list of residues from a PDB file
	# N.B. Discards alternate locations
	def build_residues(records):
	helices = build_helices(records)
	sheets = build_sheets(records)
	residues = []
	last_resSeq = None
	last_atoms = None
	for atom in build_atoms(records):
	if last_resSeq == atom['resSeq']:
	if not is_alternate(atom):
	last_atoms.append(atom)
	else:
	if last_atoms != None:
	residue = build_residue(last_atoms, helices, sheets)
	residues.append(residue)
	last_atoms = [atom]
	last_resSeq = atom['resSeq']
	residues.sort(key=lambda r: r['resNum'])
	return residues