bryan-lunt · June 30, 2010 18:03
diff --git a/mapseqres.py b/mapseqres.py
 '''
 Created on Jun 29, 2010

 @author: [email protected]
 '''

 from Bio.Seq import Seq
 import Bio.PDB as PDB
 import Bio.PDB.Polypeptide as PP
 from itertools import izip, count

 def get_seqres_from_structfile(structfile):
 	"""Expects an opened handle to the pdbXXXX.ent file to be read
 	Returns a dict of ChainID -> Bio.Seq.Seq objects representing the seqres entries of the pdb file.
 	"""
 	lines = list()
 	#get all SEQRES entries
 	for line in structfile:
 		if line.startswith("SEQRES"):
 			lines.append(line)
 	
 	threeDict = dict()
 	for line in lines:
 		chainID = line[11]
 		ress = line[19:]
 		resList = threeDict.setdefault(chainID,list())
 		resList.extend(ress.split())
 	
 	#TODO Infer for each sequences what its alphabet should be.
 	retdict = dict()
 	for key,value in threeDict.iteritems():
 		try:
 			mySeq = Seq("".join([PP.to_one_letter_code[i] for i in value]))
 			retdict[key] = mySeq
 		except:
 			pass
 	return retdict

 def get_chainSeqs_from_structure(structure):
 	"""Expects a Bio.PDB.Structure
 	Returns a dictionary mapping chain IDs -> lists of polypeptides for that chain
 	"""
 	myBuilder = PP.PPBuilder()
 	retval = dict([(aChain.id,myBuilder.build_peptides(aChain)) for aChain in structure.get_chains()])
 	return retval

 def mapSeqResToATOM(seqres,polypeplist):
 	"""Returns a list of  (singleLetter, <residue> or None) tuples for every residue in the seqres, mapping seqres position to a residue in the model (via provided polypeptide list)
 	"""
 	#we could guard against overlap here by specifying the subsection to find in, but is that necessary?
 	offsetList = [(seqres.find(polypep.get_sequence().tostring()),polypep) for polypep in polypeplist]
 	retlist = [(i,None) for i in seqres]
 	
 	#This is much easier to read and cleaner than the old version
 	#BUT, it assigns areas of memory twice, not the most super-efficient code, but who cares?
 	for startPos, polypep in offsetList:
 		end=startPos+len(polypep)
 		retlist[startPos:end] = zip(seqres[startPos:end],polypep)
 	return retlist

 def mapAll(structfile,structure):
 	"""Returns a map of chainID -> list of (singleLetter, <residue> or None) tuples for every residue in the seqres
 	"""
 	seqRes = get_seqres_from_structfile(structfile)
 	chainSeqs = get_chainSeqs_from_structure(structure)

 	retDict = dict()
 	for key in seqRes.keys():
 		if not key in chainSeqs.keys():
 			continue
 		retDict[key] = mapSeqResToATOM(seqRes[key],chainSeqs[key])
 	return retDict


 if __name__ == "__main__":
 	#do some tests
 	plist = PDB.PDBList()
 	modelfile = plist.retrieve_pdb_file("1CO0",pdir="/tmp/foobar/")
 	aparser = PDB.PDBParser()
 	structure = aparser.get_structure("X", modelfile)
 	foomaps = mapAll(open(modelfile),structure)
	'''
	Created on Jun 29, 2010

	@author: [email protected]
	'''

	from Bio.Seq import Seq
	import Bio.PDB as PDB
	import Bio.PDB.Polypeptide as PP
	from itertools import izip, count

	def get_seqres_from_structfile(structfile):
	"""Expects an opened handle to the pdbXXXX.ent file to be read
	Returns a dict of ChainID -> Bio.Seq.Seq objects representing the seqres entries of the pdb file.
	"""
	lines = list()
	#get all SEQRES entries
	for line in structfile:
	if line.startswith("SEQRES"):
	lines.append(line)

	threeDict = dict()
	for line in lines:
	chainID = line[11]
	ress = line[19:]
	resList = threeDict.setdefault(chainID,list())
	resList.extend(ress.split())

	#TODO Infer for each sequences what its alphabet should be.
	retdict = dict()
	for key,value in threeDict.iteritems():
	try:
	mySeq = Seq("".join([PP.to_one_letter_code[i] for i in value]))
	retdict[key] = mySeq
	except:
	pass
	return retdict

	def get_chainSeqs_from_structure(structure):
	"""Expects a Bio.PDB.Structure
	Returns a dictionary mapping chain IDs -> lists of polypeptides for that chain
	"""
	myBuilder = PP.PPBuilder()
	retval = dict([(aChain.id,myBuilder.build_peptides(aChain)) for aChain in structure.get_chains()])
	return retval

	def mapSeqResToATOM(seqres,polypeplist):
	"""Returns a list of (singleLetter, <residue> or None) tuples for every residue in the seqres, mapping seqres position to a residue in the model (via provided polypeptide list)
	"""
	#we could guard against overlap here by specifying the subsection to find in, but is that necessary?
	offsetList = [(seqres.find(polypep.get_sequence().tostring()),polypep) for polypep in polypeplist]
	retlist = [(i,None) for i in seqres]

	#This is much easier to read and cleaner than the old version
	#BUT, it assigns areas of memory twice, not the most super-efficient code, but who cares?
	for startPos, polypep in offsetList:
	end=startPos+len(polypep)
	retlist[startPos:end] = zip(seqres[startPos:end],polypep)
	return retlist

	def mapAll(structfile,structure):
	"""Returns a map of chainID -> list of (singleLetter, <residue> or None) tuples for every residue in the seqres
	"""
	seqRes = get_seqres_from_structfile(structfile)
	chainSeqs = get_chainSeqs_from_structure(structure)

	retDict = dict()
	for key in seqRes.keys():
	if not key in chainSeqs.keys():
	continue
	retDict[key] = mapSeqResToATOM(seqRes[key],chainSeqs[key])
	return retDict


	if __name__ == "__main__":
	#do some tests
	plist = PDB.PDBList()
	modelfile = plist.retrieve_pdb_file("1CO0",pdir="/tmp/foobar/")
	aparser = PDB.PDBParser()
	structure = aparser.get_structure("X", modelfile)
	foomaps = mapAll(open(modelfile),structure)