joezuntz · October 12, 2015 12:38
diff --git a/extract_deliverables.py b/extract_deliverables.py
 import glob
 import re
 import pyparsing as pp
 import collections


 def parse_projects(text):
     """Scan LaTeX source text and return the list of key projects found.

     The keyproject, deliverable, keytask and prereq macros are each given
     a parse action built around the same list, so all four handlers read
     and write one shared structure. That list is returned after the scan.
     """
     # Accumulator handed to every parse action below.
     found = []

     # nestedExpr matches a bracketed group, including brackets nested
     # inside it, so each macro argument is captured as one outer group.
     square = pp.nestedExpr(opener='[', closer=']')
     curly = pp.nestedExpr(opener='{', closer='}')

     # (pattern, handler class) pairs. The first three macros share the
     # shape  \macro[..]{..}{..} ; prereq takes a single braced argument,
     # e.g. \prereq{\deliverableref{TJP2-DC1-SW1}, \deliverableref{CI5}}
     rules = [
         (r"\keyproject" + square + curly + curly, ProjectParser),
         (r"\deliverable" + square + curly + curly, DeliverableParser),
         (r"\keytask" + square + curly + curly, KeyTaskParser),
         (r"\prereq" + pp.nestedExpr(opener='{', closer='}'), PreReqParser),
     ]

     alternatives = []
     for pattern, handler_class in rules:
         pattern.setParseAction(handler_class(found))
         alternatives.append(pattern)

     # Search for any of the four patterns.
     search_pattern = pp.Or(alternatives)

     # scanString is lazy: nothing is parsed (and no parse action runs)
     # until the generator is consumed, so drain it explicitly.
     for _ in search_pattern.scanString(text):
         pass

     return found

 class ElementParser(object):
     """Base for parse actions that record results in a shared list."""

     def __init__(self, projects):
         # Stored by reference, so anything appended here is visible to
         # every other holder of the same list.
         self.projects = projects

     def parse(self, content):
         """Build the name/code/date dict for one matched element.

         content is read positionally: item 1 gives the code, item 2 the
         date and item 3 the name. Item 0 is not used.
         """
         code, date, name = (match_to_string(content[k]) for k in (1, 2, 3))
         return {'name': name, 'code': code, 'date': date}

 class ProjectParser(ElementParser):
     """Parse action that appends a new project to the shared list."""

     def __call__(self, s, loc, content):
         # s and loc are part of the parse-action signature but unused.
         # Every project starts with an empty list of deliverables.
         project = dict(self.parse(content), deliverables=[])
         self.projects.append(project)

 class DeliverableParser(ElementParser):
     """Parse action that attaches a deliverable to the latest project."""

     def __call__(self, s, loc, content):
         # s and loc are part of the parse-action signature but unused.
         deliverable = dict(self.parse(content), keytasks=[], prereqs=[])
         # The deliverable belongs to the most recently appended project.
         self.projects[-1]['deliverables'].append(deliverable)

 class KeyTaskParser(ElementParser):
     """Parse action that attaches a key task to the latest deliverable."""

     def __call__(self, s, loc, content):
         # s and loc are part of the parse-action signature but unused.
         task = self.parse(content)
         # Target: last deliverable of the last project in the shared list.
         parent = self.projects[-1]['deliverables'][-1]
         parent['keytasks'].append(task)

 class PreReqParser(object):
     """Parse action that records the references listed in a prereq macro.

     Each reference is stored as a prefixed string ('deliverable:CODE' or
     'keyproject:CODE') on the 'prereqs' list of the last deliverable of
     the last project in the shared list.
     """

     # Reference macro name -> prefix stored in front of each code.
     _REF_PREFIXES = {
         r'\deliverableref': 'deliverable:',
         r'\keyprojectref': 'keyproject:',
     }

     def __init__(self, projects):
         self.projects = projects
         group = pp.nestedExpr(opener='{', closer='}')
         # Not read by __call__; kept so the attributes stay available.
         self.projectref_pattern = r"\keyprojectref" + group
         self.deliverableref_pattern = r"\deliverableref" + group

     def __call__(self, s, loc, content):
         """Collect the references inside one matched prereq macro.

         content must provide asList(); item 1 of that list is the token
         list of the braced argument. s and loc are unused. Progress is
         printed to stdout. Nothing is recorded for an empty argument.
         """
         content = content.asList()[1]
         # Single-argument print(...) is valid on Python 2 and Python 3.
         print('-' * 30)
         print(content)
         print('-' * 30)
         if not content:
             return

         current_project = self.projects[-1]
         prereqs = current_project['deliverables'][-1]['prereqs']

         i = 0
         count = len(content)
         while i < count:
             token = content[i]
             prefix = None
             if isinstance(token, str):
                 # A token may carry a separating comma, e.g.
                 # ',\deliverableref', so strip commas and blanks first.
                 prefix = self._REF_PREFIXES.get(token.strip(',').strip())
             # A macro only counts when a braced group (a list) follows.
             # This avoids an IndexError on a trailing macro and avoids
             # iterating a plain string character by character.
             has_argument = (i + 1 < count
                             and isinstance(content[i + 1], list))
             if prefix is None or not has_argument:
                 i += 1
                 continue
             for req in content[i + 1]:
                 if req == ',':
                     continue
                 prereqs.append(prefix + req)
             i += 2

         print('')
         print("project {0} reqs:{1}".format(current_project['name'], prereqs))
         print('')
         print('')

 def find_included_files(base):
     r"""Return the names given to \include{...} in the LaTeX file base.

     base is read through read_latex, so includes on comment lines are
     ignored. Names are returned in order of appearance, without any
     extension added. A name may contain ASCII letters, digits, '_', '-'
     and '/'.
     """
     # The earlier class used the range 'A-z', which also spans the
     # characters [ \ ] ^ _ and backtick; the allowed set is now explicit.
     # The pattern has no whitespace, '#', '^' or '$', so the VERBOSE and
     # MULTILINE flags had no effect and are not needed.
     include = re.compile(r'\\include\{([A-Za-z0-9_/-]+)\}')
     return include.findall(read_latex(base))


 def read_latex(filename):
     """Return the contents of filename without LaTeX comment lines.

     A line is dropped when its first non-whitespace character is '%'.
     A '%' comment that follows other text on a line is left in place.
     Kept lines retain their own line endings.
     """
     # The with-block closes the file even if reading fails.
     with open(filename) as handle:
         # Lines already end in a newline, so join on the empty string;
         # joining on a newline would double every line break.
         return ''.join(
             line for line in handle if not line.strip().startswith('%')
         )

 def match_to_string(m):
     """Flatten nested match results into one space-separated string.

     A str is returned unchanged; anything else is iterated, each item is
     flattened recursively, and the pieces are joined with single spaces.
     """
     if not isinstance(m, str):
         pieces = [match_to_string(item) for item in m]
         m = ' '.join(pieces)
     return m


 def dump_projects(groups):
     """Print a text outline of the projects in groups to stdout.

     groups maps a file name to a list of project dicts. Each project
     needs 'name', 'code' and 'deliverables'; each deliverable needs
     'name', 'code' and 'keytasks'; each key task needs 'name' and
     'code'. Entries whose project list is empty are skipped.
     """
     # Every print below uses the single-argument call form, which
     # behaves the same on Python 2 and Python 3. print('') emits a blank
     # line on both, where a bare print() would show '()' on Python 2.
     for filename, projects in groups.items():
         if not projects:
             continue
         print(filename.upper())
         # The underline is at least 10 characters wide.
         print("=" * max(len(filename), 10))
         print('')
         for proj in projects:
             print("+ {0[name]}  [{0[code]}]".format(proj))
             for dv in proj['deliverables']:
                 print('    -  {0[name]}  [{0[code]}]'.format(dv))
                 for kt in dv['keytasks']:
                     print('        + {0[name]}  [{0[code]}]'.format(kt))
             print('')
         print('')
         print('')
         print('')


 def main():
     """Parse every file included from srm.tex and print the result as JSON.

     The output is one JSON object mapping each included file name to its
     list of projects. Files that yield no projects are left out.
     """
     import json

     groups = {}
     # The included names carry no extension, so ".tex" is appended.
     for filename in find_included_files("srm.tex"):
         projects = parse_projects(read_latex(filename + ".tex"))
         if projects:
             groups[filename] = projects

     # Other possible outputs: dump_projects(groups) for a text outline,
     # or yaml.dump(groups) if PyYAML is available.
     # The call form of print works on both Python 2 and Python 3.
     print(json.dumps(groups))

 # Run the extraction only when executed as a script, not when imported.
 if __name__ == "__main__":
     main()
	import glob
	import re
	import pyparsing as pp
	import collections


	def parse_projects(text):
	    """Scan LaTeX source text and return the list of key projects found.

	    The keyproject, deliverable, keytask and prereq macros are each given
	    a parse action built around the same list, so all four handlers read
	    and write one shared structure. That list is returned after the scan.
	    """
	    # Accumulator handed to every parse action below.
	    found = []

	    # nestedExpr matches a bracketed group, including brackets nested
	    # inside it, so each macro argument is captured as one outer group.
	    square = pp.nestedExpr(opener='[', closer=']')
	    curly = pp.nestedExpr(opener='{', closer='}')

	    # (pattern, handler class) pairs. The first three macros share the
	    # shape  \macro[..]{..}{..} ; prereq takes a single braced argument,
	    # e.g. \prereq{\deliverableref{TJP2-DC1-SW1}, \deliverableref{CI5}}
	    rules = [
	        (r"\keyproject" + square + curly + curly, ProjectParser),
	        (r"\deliverable" + square + curly + curly, DeliverableParser),
	        (r"\keytask" + square + curly + curly, KeyTaskParser),
	        (r"\prereq" + pp.nestedExpr(opener='{', closer='}'), PreReqParser),
	    ]

	    alternatives = []
	    for pattern, handler_class in rules:
	        pattern.setParseAction(handler_class(found))
	        alternatives.append(pattern)

	    # Search for any of the four patterns.
	    search_pattern = pp.Or(alternatives)

	    # scanString is lazy: nothing is parsed (and no parse action runs)
	    # until the generator is consumed, so drain it explicitly.
	    for _ in search_pattern.scanString(text):
	        pass

	    return found

	class ElementParser(object):
	    """Base for parse actions that record results in a shared list."""

	    def __init__(self, projects):
	        # Stored by reference, so anything appended here is visible to
	        # every other holder of the same list.
	        self.projects = projects

	    def parse(self, content):
	        """Build the name/code/date dict for one matched element.

	        content is read positionally: item 1 gives the code, item 2 the
	        date and item 3 the name. Item 0 is not used.
	        """
	        code = match_to_string(content[1])
	        date = match_to_string(content[2])
	        name = match_to_string(content[3])
	        return {'name': name, 'code': code, 'date': date}

	class ProjectParser(ElementParser):
	    """Parse action that appends a new project to the shared list."""

	    def __call__(self, s, loc, content):
	        # s and loc are part of the parse-action signature but unused.
	        info = self.parse(content)
	        # Every project starts with an empty list of deliverables.
	        info['deliverables'] = []
	        self.projects.append(info)

	class DeliverableParser(ElementParser):
	    """Parse action that attaches a deliverable to the latest project."""

	    def __call__(self, s, loc, content):
	        # s and loc are part of the parse-action signature but unused.
	        info = self.parse(content)
	        info['keytasks'] = []
	        info['prereqs'] = []
	        # The deliverable belongs to the most recently appended project.
	        current_project = self.projects[-1]
	        current_project['deliverables'].append(info)

	class KeyTaskParser(ElementParser):
	    """Parse action that attaches a key task to the latest deliverable."""

	    def __call__(self, s, loc, content):
	        # s and loc are part of the parse-action signature but unused.
	        info = self.parse(content)
	        # Target: last deliverable of the last project in the shared list.
	        current_project = self.projects[-1]
	        current_deliverable = current_project['deliverables'][-1]
	        current_deliverable['keytasks'].append(info)

	class PreReqParser(object):
	    """Parse action that records the references listed in a prereq macro.

	    Each reference is stored as a prefixed string ('deliverable:CODE' or
	    'keyproject:CODE') on the 'prereqs' list of the last deliverable of
	    the last project in the shared list.
	    """

	    # Reference macro name -> prefix stored in front of each code.
	    _REF_PREFIXES = {
	        r'\deliverableref': 'deliverable:',
	        r'\keyprojectref': 'keyproject:',
	    }

	    def __init__(self, projects):
	        self.projects = projects
	        group = pp.nestedExpr(opener='{', closer='}')
	        # Not read by __call__; kept so the attributes stay available.
	        self.projectref_pattern = r"\keyprojectref" + group
	        self.deliverableref_pattern = r"\deliverableref" + group

	    def __call__(self, s, loc, content):
	        """Collect the references inside one matched prereq macro.

	        content must provide asList(); item 1 of that list is the token
	        list of the braced argument. s and loc are unused. Progress is
	        printed to stdout. Nothing is recorded for an empty argument.
	        """
	        content = content.asList()[1]
	        # Single-argument print(...) is valid on Python 2 and Python 3.
	        print('-' * 30)
	        print(content)
	        print('-' * 30)
	        if not content:
	            return

	        current_project = self.projects[-1]
	        prereqs = current_project['deliverables'][-1]['prereqs']

	        i = 0
	        count = len(content)
	        while i < count:
	            token = content[i]
	            prefix = None
	            if isinstance(token, str):
	                # A token may carry a separating comma, e.g.
	                # ',\deliverableref', so strip commas and blanks first.
	                prefix = self._REF_PREFIXES.get(token.strip(',').strip())
	            # A macro only counts when a braced group (a list) follows.
	            # This avoids an IndexError on a trailing macro and avoids
	            # iterating a plain string character by character.
	            has_argument = (i + 1 < count
	                            and isinstance(content[i + 1], list))
	            if prefix is None or not has_argument:
	                i += 1
	                continue
	            for req in content[i + 1]:
	                if req == ',':
	                    continue
	                prereqs.append(prefix + req)
	            i += 2

	        print('')
	        print("project {0} reqs:{1}".format(current_project['name'], prereqs))
	        print('')
	        print('')

	def find_included_files(base):
	    r"""Return the names given to \include{...} in the LaTeX file base.

	    base is read through read_latex, so includes on comment lines are
	    ignored. Names are returned in order of appearance, without any
	    extension added. A name may contain ASCII letters, digits, '_', '-'
	    and '/'.
	    """
	    # The earlier class used the range 'A-z', which also spans the
	    # characters [ \ ] ^ _ and backtick; the allowed set is now explicit.
	    # The pattern has no whitespace, '#', '^' or '$', so the VERBOSE and
	    # MULTILINE flags had no effect and are not needed.
	    include = re.compile(r'\\include\{([A-Za-z0-9_/-]+)\}')
	    return include.findall(read_latex(base))


	def read_latex(filename):
	    """Return the contents of filename without LaTeX comment lines.

	    A line is dropped when its first non-whitespace character is '%'.
	    A '%' comment that follows other text on a line is left in place.
	    Kept lines retain their own line endings.
	    """
	    # The with-block closes the file even if reading fails.
	    with open(filename) as handle:
	        # Lines already end in a newline, so join on the empty string;
	        # joining on a newline would double every line break.
	        return ''.join(
	            line for line in handle if not line.strip().startswith('%')
	        )

	def match_to_string(m):
	    """Flatten nested match results into one space-separated string.

	    A str is returned unchanged; anything else is iterated, each item is
	    flattened recursively, and the pieces are joined with single spaces.
	    """
	    if isinstance(m, str):
	        return m
	    return ' '.join(match_to_string(mi) for mi in m)


	def dump_projects(groups):
	    """Print a text outline of the projects in groups to stdout.

	    groups maps a file name to a list of project dicts. Each project
	    needs 'name', 'code' and 'deliverables'; each deliverable needs
	    'name', 'code' and 'keytasks'; each key task needs 'name' and
	    'code'. Entries whose project list is empty are skipped.
	    """
	    # Every print below uses the single-argument call form, which
	    # behaves the same on Python 2 and Python 3. print('') emits a blank
	    # line on both, where a bare print() would show '()' on Python 2.
	    for filename, projects in groups.items():
	        if not projects:
	            continue
	        print(filename.upper())
	        # The underline is at least 10 characters wide.
	        print("=" * max(len(filename), 10))
	        print('')
	        for proj in projects:
	            print("+ {0[name]} [{0[code]}]".format(proj))
	            for dv in proj['deliverables']:
	                print(' - {0[name]} [{0[code]}]'.format(dv))
	                for kt in dv['keytasks']:
	                    print(' + {0[name]} [{0[code]}]'.format(kt))
	            print('')
	        print('')
	        print('')
	        print('')


	def main():
	    """Parse every file included from srm.tex and print the result as JSON.

	    The output is one JSON object mapping each included file name to its
	    list of projects. Files that yield no projects are left out.
	    """
	    import json

	    groups = {}
	    # The included names carry no extension, so ".tex" is appended.
	    for filename in find_included_files("srm.tex"):
	        projects = parse_projects(read_latex(filename + ".tex"))
	        if projects:
	            groups[filename] = projects

	    # Other possible outputs: dump_projects(groups) for a text outline,
	    # or yaml.dump(groups) if PyYAML is available.
	    # The call form of print works on both Python 2 and Python 3.
	    print(json.dumps(groups))

	# Run the extraction only when executed as a script, not when imported.
	if __name__ == '__main__':
	    main()