JoaoRodrigues · October 29, 2014 00:27
diff --git a/sampling_analysis.py b/sampling_analysis.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*
 """
 Utility script to analyze several docking poses of a protein complex.

 Produces PDB file with color-coded ligand center-of-mass positions
 that reflect the population density (i.e. number of neighbor models).

 Works with multiple chain models by considering as "ligand" all chains 
 defined as NOT part of the receptor. Default, chain A is the only receptor.

 João Rodrigues @ Utrecht, 2014
 """

 import copy
 import os
 import sys

 try:
    from Bio.PDB import PDBParser
    from Bio.PDB import PDBIO
    from Bio.PDB import Superimposer
    
    from Bio.PDB import StructureBuilder

    from Bio.PDB import NeighborSearch
 except ImportError:
    raise Exception('Could not import Biopython PDB modules')

 # Main Code
 if __name__ == '__main__':

    import argparse
    
    parser = argparse.ArgumentParser(description='Analyse sampling of docking runs.')
    # Define args
    parser.add_argument('structures', metavar='pdb_file(s)', type=str, nargs='+',
                        help='Docking models containing all chains')
    # Define options
    parser.add_argument('--output', metavar='output.pdb', type=str,
                        default='sampling.pdb',
                        help='Name of the output analysis PDB file')
    parser.add_argument('--receptor', metavar='A', type=str, nargs='+',
                        default='A',
                        help='Chains to consider as receptor')
    parser.add_argument('--radius', metavar='N Å', type=float,
                        default=10.0,
                        help='Radius in Å to consider neighboring ligand COMs')

    ui = parser.parse_args() # user input

    pdb_parser = PDBParser(QUIET=1, PERMISSIVE=1)
        
    # Start analysis
    print "= Starting ..."

    lst_models = []
    for pdb_model in ui.structures:
        fpath = os.path.abspath(pdb_model)
        if not fpath.endswith('.pdb'):
            print >>sys.stderr, "'{0}' is not a PDB file. Ignoring...".format(fpath)
            continue
            
        lst_models.append(fpath)

    print "\t Read {0} models".format(len(lst_models))    

    # Fit on receptor chains of the first model
    # Select only CA or P atoms
    print "= Extracting reference atoms from first model"

    receptor_chains = set(ui.receptor)
    print "\tUsing chains as receptor: {0}".format(','.join(receptor_chains))


    refe_atoms = []
    refe_path = lst_models[0]
    refe_model = pdb_parser.get_structure(os.path.basename(refe_path), refe_path)
    for residue in refe_model.get_residues():
        if residue.parent.id in receptor_chains:
            if residue.id[0] == ' ' and 'CA' in residue:
                refe_atoms.append(residue['CA'])
            elif residue.id[0] == ' ' and 'P' in residue:
                refe_atoms.append(residue['P'])
                
    if not refe_atoms:
        msg = '\tNo "CA" or "P" atoms found in first model for the specified chain(s)'
        print >>sys.stderr, msg
        sys.exit(1)

    # Create 'sampling' structure with receptor chains of first model
    sb = StructureBuilder.StructureBuilder()
    sb.init_structure('sampling')
    sb.init_model(0)
    sb.init_seg('')
    for chain in refe_model.get_chains():
        if chain.id in receptor_chains:
            sb.model.add(chain)

    # Free space?
    del refe_model
    
    # Fit to reference
    si = Superimposer()
    
    print "= Fitting the remaining models"
    ligand_atoms = []
    for model_path in lst_models:
        model = pdb_parser.get_structure(os.path.basename(model_path), model_path)
        mobi_atoms = []
        ligand_atoms.append([])
        for residue in model.get_residues():
            if residue.parent.id in receptor_chains:
                if residue.id[0] == ' ' and 'CA' in residue:
                    mobi_atoms.append(residue['CA'])
                elif residue.id[0] == ' ' and 'P' in residue:
                    mobi_atoms.append(residue['P'])
            else: # collect ligand atoms
                for atom in residue:
                    ligand_atoms[-1].append(atom)

        if not mobi_atoms:
            msg = '\tNo "CA" or "P" atoms found in "{0}" for the specified chain(s). Skipping.'
            print >>sys.stderr, msg.format(model.id)
            continue

        si.set_atoms(refe_atoms, mobi_atoms)
        si.apply(model.get_atoms())

        # Free Space?
        del model
    
    # C. Calculate rotated ligand center-of-mass
    # D. Append COM as a residue of chain X for the sampling.pdb structure.
    print "= Calculating ligand center-of-mass"
    sb.init_chain('X')
    ligand_coms = []
    modifier = 0 # id modifier for >9999 models
    for imodel, atom_lst in enumerate(ligand_atoms, start=1):
        coord_lst = [a.coord for a in atom_lst]
        ligand_com = sum(coord_lst)/len(coord_lst)
        # Create 'virtual' atom with COM coordinates
        # Show as oxygen since red color is nice to visualize.
        # Avoid numbering issues..
        # Im leaving this like this.. who wants to analyze more than 20k positions..
        if imodel >= 9999:
            modifier = -9998
            sb.init_chain('Y')

        sb.init_residue('COM', ' ', imodel+modifier, ' ')
        sb.init_atom('XYZ', ligand_com, 0.0, 0.0, ' ', 'XYZ', 1, 'O')
        ligand_coms.append(sb.atom)

    # Analyze COM distances
    print "= Analyzing COM distances"
    nb = NeighborSearch(ligand_coms)
    nneighbors = []
    for com in ligand_coms:
        nn = nb.search(com.coord, ui.radius, level='R')
        # minimum 1 (itself)
        nneighbors.append(len(nn))
    
    # Normalize neighbor counts
    max_nn = float(max(nneighbors))
    min_nn = min(nneighbors)
    nneighbors = [(n-min_nn)/max_nn for n in nneighbors]
    print '\tLarger Cluster: {0} elements'.format(int(max_nn))
    
    # Add to bfactor
    for com, nn in zip(ligand_coms, nneighbors):
        com.bfactor = nn

    # Write sampling.pdb
    print "= Writing analysis file to: {0}".format(ui.output)
    io = PDBIO()
    output_structure = sb.get_structure()
    io.set_structure(output_structure)
    io.save(ui.output)

    print "= Writing Pymol visualization file"
    with open('sampling.pml', 'w') as handle:
        print >>handle, 'color white'
        print >>handle, 'spectrum b, rainbow, chain X+Y'
        print >>handle, 'as spheres, chain X+Y'
        print >>handle, 'as surface, not chain X+Y'
        print >>handle, 'center not chain X+Y'
        print >>handle, 'zoom'
    
    print "= End"
	#!/usr/bin/env python
	# -- coding: utf-8 -
	"""
	Utility script to analyze several docking poses of a protein complex.

	Produces PDB file with color-coded ligand center-of-mass positions
	that reflect the population density (i.e. number of neighbor models).

	Works with multiple chain models by considering as "ligand" all chains
	defined as NOT part of the receptor. Default, chain A is the only receptor.

	João Rodrigues @ Utrecht, 2014
	"""

	import copy
	import os
	import sys

	try:
	from Bio.PDB import PDBParser
	from Bio.PDB import PDBIO
	from Bio.PDB import Superimposer

	from Bio.PDB import StructureBuilder

	from Bio.PDB import NeighborSearch
	except ImportError:
	raise Exception('Could not import Biopython PDB modules')

	# Main Code
	if __name__ == '__main__':

	import argparse

	parser = argparse.ArgumentParser(description='Analyse sampling of docking runs.')
	# Define args
	parser.add_argument('structures', metavar='pdb_file(s)', type=str, nargs='+',
	help='Docking models containing all chains')
	# Define options
	parser.add_argument('--output', metavar='output.pdb', type=str,
	default='sampling.pdb',
	help='Name of the output analysis PDB file')
	parser.add_argument('--receptor', metavar='A', type=str, nargs='+',
	default='A',
	help='Chains to consider as receptor')
	parser.add_argument('--radius', metavar='N Å', type=float,
	default=10.0,
	help='Radius in Å to consider neighboring ligand COMs')

	ui = parser.parse_args() # user input

	pdb_parser = PDBParser(QUIET=1, PERMISSIVE=1)

	# Start analysis
	print "= Starting ..."

	lst_models = []
	for pdb_model in ui.structures:
	fpath = os.path.abspath(pdb_model)
	if not fpath.endswith('.pdb'):
	print >>sys.stderr, "'{0}' is not a PDB file. Ignoring...".format(fpath)
	continue

	lst_models.append(fpath)

	print "\t Read {0} models".format(len(lst_models))

	# Fit on receptor chains of the first model
	# Select only CA or P atoms
	print "= Extracting reference atoms from first model"

	receptor_chains = set(ui.receptor)
	print "\tUsing chains as receptor: {0}".format(','.join(receptor_chains))


	refe_atoms = []
	refe_path = lst_models[0]
	refe_model = pdb_parser.get_structure(os.path.basename(refe_path), refe_path)
	for residue in refe_model.get_residues():
	if residue.parent.id in receptor_chains:
	if residue.id[0] == ' ' and 'CA' in residue:
	refe_atoms.append(residue['CA'])
	elif residue.id[0] == ' ' and 'P' in residue:
	refe_atoms.append(residue['P'])

	if not refe_atoms:
	msg = '\tNo "CA" or "P" atoms found in first model for the specified chain(s)'
	print >>sys.stderr, msg
	sys.exit(1)

	# Create 'sampling' structure with receptor chains of first model
	sb = StructureBuilder.StructureBuilder()
	sb.init_structure('sampling')
	sb.init_model(0)
	sb.init_seg('')
	for chain in refe_model.get_chains():
	if chain.id in receptor_chains:
	sb.model.add(chain)

	# Free space?
	del refe_model

	# Fit to reference
	si = Superimposer()

	print "= Fitting the remaining models"
	ligand_atoms = []
	for model_path in lst_models:
	model = pdb_parser.get_structure(os.path.basename(model_path), model_path)
	mobi_atoms = []
	ligand_atoms.append([])
	for residue in model.get_residues():
	if residue.parent.id in receptor_chains:
	if residue.id[0] == ' ' and 'CA' in residue:
	mobi_atoms.append(residue['CA'])
	elif residue.id[0] == ' ' and 'P' in residue:
	mobi_atoms.append(residue['P'])
	else: # collect ligand atoms
	for atom in residue:
	ligand_atoms[-1].append(atom)

	if not mobi_atoms:
	msg = '\tNo "CA" or "P" atoms found in "{0}" for the specified chain(s). Skipping.'
	print >>sys.stderr, msg.format(model.id)
	continue

	si.set_atoms(refe_atoms, mobi_atoms)
	si.apply(model.get_atoms())

	# Free Space?
	del model

	# C. Calculate rotated ligand center-of-mass
	# D. Append COM as a residue of chain X for the sampling.pdb structure.
	print "= Calculating ligand center-of-mass"
	sb.init_chain('X')
	ligand_coms = []
	modifier = 0 # id modifier for >9999 models
	for imodel, atom_lst in enumerate(ligand_atoms, start=1):
	coord_lst = [a.coord for a in atom_lst]
	ligand_com = sum(coord_lst)/len(coord_lst)
	# Create 'virtual' atom with COM coordinates
	# Show as oxygen since red color is nice to visualize.
	# Avoid numbering issues..
	# Im leaving this like this.. who wants to analyze more than 20k positions..
	if imodel >= 9999:
	modifier = -9998
	sb.init_chain('Y')

	sb.init_residue('COM', ' ', imodel+modifier, ' ')
	sb.init_atom('XYZ', ligand_com, 0.0, 0.0, ' ', 'XYZ', 1, 'O')
	ligand_coms.append(sb.atom)

	# Analyze COM distances
	print "= Analyzing COM distances"
	nb = NeighborSearch(ligand_coms)
	nneighbors = []
	for com in ligand_coms:
	nn = nb.search(com.coord, ui.radius, level='R')
	# minimum 1 (itself)
	nneighbors.append(len(nn))

	# Normalize neighbor counts
	max_nn = float(max(nneighbors))
	min_nn = min(nneighbors)
	nneighbors = [(n-min_nn)/max_nn for n in nneighbors]
	print '\tLarger Cluster: {0} elements'.format(int(max_nn))

	# Add to bfactor
	for com, nn in zip(ligand_coms, nneighbors):
	com.bfactor = nn

	# Write sampling.pdb
	print "= Writing analysis file to: {0}".format(ui.output)
	io = PDBIO()
	output_structure = sb.get_structure()
	io.set_structure(output_structure)
	io.save(ui.output)

	print "= Writing Pymol visualization file"
	with open('sampling.pml', 'w') as handle:
	print >>handle, 'color white'
	print >>handle, 'spectrum b, rainbow, chain X+Y'
	print >>handle, 'as spheres, chain X+Y'
	print >>handle, 'as surface, not chain X+Y'
	print >>handle, 'center not chain X+Y'
	print >>handle, 'zoom'

	print "= End"