chulman444 · June 23, 2018 15:15
diff --git a/examine_docx_elements.py b/examine_docx_elements.py
 try:
 	from xml.etree.cElementTree import XML
 except ImportError:
 	from xml.etree.ElementTree import XML
 import zipfile, re


 """
 Module that lists the distinct XML element tags found in an MS Word XML document (.docx).
 (Inspired by python-docx <https://github.com/mikemaccana/python-docx>)
 """

 # Namespace URI of the WordprocessingML main schema, wrapped in braces so that
 # prepending it to a local name yields a '{namespace}local' qualified tag string.
 WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
 # Qualified tag strings for the local names 'p' and 't'.
 # NOTE(review): neither PARA nor TEXT is referenced by the code visible here - confirm before removing.
 PARA = WORD_NAMESPACE + 'p'
 TEXT = WORD_NAMESPACE + 't'


 def getAllTags(path):
 	"""
 	Return the tag of every element below the root of a .docx main document.

 	Reads 'word/document.xml' from the zip archive at `path`, parses it and
 	collects the tag of each descendant of the root element in document
 	order (parent before its children). The root element's own tag is not
 	included. Tags are returned exactly as ElementTree reports them, and a
 	tag appears once per occurrence (no de-duplication).

 	Raises whatever zipfile / the XML parser raise for a missing file, a
 	non-zip file, a missing 'word/document.xml' member or malformed XML.
 	"""
 	# The context manager closes the archive even if read() raises.
 	with zipfile.ZipFile(path) as document:
 		xml_content = document.read('word/document.xml')
 	tree = XML(xml_content)

 	# Element.iter() walks the tree in document order starting with the root
 	# itself; drop that first entry to keep only the descendants. A fresh list
 	# is built on every call, so results never accumulate between calls.
 	return [element.tag for element in tree.iter()][1:]

 def recursion(children, output=None):
 	"""
 	Collect the tags of `children` and all of their descendants, depth first.

 	children -- iterable of ElementTree elements.
 	output   -- optional list to append to; a new list is created when omitted.

 	Each element's tag is appended before the tags of its own children.
 	Returns the list that was appended to (the same object as `output` when
 	one was supplied).
 	"""
 	# A fresh list per call: a mutable default would be shared by every call
 	# and keep growing.
 	if output is None:
 		output = []
 	for child in children:
 		output.append(child.tag)
 		# Iterating an element yields its direct children; for a leaf the
 		# nested loop simply does nothing.
 		recursion(child, output)

 	return output

 import sys

 def run(docx_filepath):
 	"""
 	Return the namespace-free names of the distinct tags in a .docx file.

 	Tags are de-duplicated through a set before their namespaces are
 	stripped, so the order of the returned list is arbitrary.
 	"""
 	distinct_tags = set(getAllTags(docx_filepath))
 	return removeNamespaces(list(distinct_tags))

 def main():
 	"""Print the tag names of the .docx file named by the first command-line argument."""
 	docx_filepath = sys.argv[1]
 	print(run(docx_filepath))

 def removeNamespaces(tags):
 	"""
 	Strip the leading '{namespace}' part from each tag string.

 	tags -- iterable of tag strings such as '{http://...}p'.

 	Returns a new list, in the same order, holding only the local names.
 	A tag that does not start with '{' has no namespace part and is
 	returned unchanged instead of raising.
 	"""
 	output = []
 	for tag in tags:
 		if tag.startswith('{'):
 			# Keep what follows the last '}', i.e. the local name.
 			tag = tag.rpartition('}')[2]
 		output.append(tag)

 	return output

 # Call main() only when this file is executed as a script, not when it is imported.
 if __name__=="__main__":
 	main()
	try:
	from xml.etree.cElementTree import XML
	except ImportError:
	from xml.etree.ElementTree import XML
	import zipfile, re


	"""
	Module that lists the distinct XML element tags found in an MS Word XML document (.docx).
	(Inspired by python-docx <https://github.com/mikemaccana/python-docx>)
	"""

	# Namespace URI of the WordprocessingML main schema, wrapped in braces so that
	# prepending it to a local name yields a '{namespace}local' qualified tag string.
	WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
	# Qualified tag strings for the local names 'p' and 't'.
	# NOTE(review): neither PARA nor TEXT is referenced by the code visible here - confirm before removing.
	PARA = WORD_NAMESPACE + 'p'
	TEXT = WORD_NAMESPACE + 't'


	def getAllTags(path):
		"""
		Return the tag of every element below the root of a .docx main document.

		Reads 'word/document.xml' from the zip archive at `path`, parses it and
		collects the tag of each descendant of the root element in document
		order (parent before its children). The root element's own tag is not
		included. Tags are returned exactly as ElementTree reports them, and a
		tag appears once per occurrence (no de-duplication).

		Raises whatever zipfile / the XML parser raise for a missing file, a
		non-zip file, a missing 'word/document.xml' member or malformed XML.
		"""
		# The context manager closes the archive even if read() raises.
		with zipfile.ZipFile(path) as document:
			xml_content = document.read('word/document.xml')
		tree = XML(xml_content)

		# Element.iter() walks the tree in document order starting with the root
		# itself; drop that first entry to keep only the descendants. A fresh
		# list is built on every call, so results never accumulate between calls.
		return [element.tag for element in tree.iter()][1:]

	def recursion(children, output=None):
		"""
		Collect the tags of `children` and all of their descendants, depth first.

		children -- iterable of ElementTree elements.
		output   -- optional list to append to; a new list is created when omitted.

		Each element's tag is appended before the tags of its own children.
		Returns the list that was appended to (the same object as `output` when
		one was supplied).
		"""
		# A fresh list per call: a mutable default would be shared by every
		# call and keep growing.
		if output is None:
			output = []
		for child in children:
			output.append(child.tag)
			# Iterating an element yields its direct children; for a leaf the
			# nested loop simply does nothing.
			recursion(child, output)

		return output

	import sys

	def run(docx_filepath):
		"""
		Return the namespace-free names of the distinct tags in a .docx file.

		Tags are de-duplicated through a set before their namespaces are
		stripped, so the order of the returned list is arbitrary.
		"""
		all_tags = getAllTags(docx_filepath)
		unique_tags = list(set(all_tags))
		simplified_tags = removeNamespaces(unique_tags)
		return simplified_tags

	def main():
		"""Print the tag names of the .docx file named by the first command-line argument."""
		output = run(sys.argv[1])
		print(output)

	def removeNamespaces(tags):
		"""
		Strip the leading '{namespace}' part from each tag string.

		tags -- iterable of tag strings such as '{http://...}p'.

		Returns a new list, in the same order, holding only the local names.
		A tag that does not start with '{' has no namespace part and is
		returned unchanged instead of raising.
		"""
		output = []
		for tag in tags:
			if tag.startswith('{'):
				# Keep what follows the last '}', i.e. the local name.
				tag = tag.rpartition('}')[2]
			output.append(tag)

		return output

	# Call main() only when this file is executed as a script, not when it is imported.
	if __name__=="__main__":
		main()
No results found