etienned · November 21, 2022 13:56 · AgnijDave · May 31, 2021
diff --git a/extractdocx.py b/extractdocx.py
 try:
    from xml.etree.cElementTree import XML
 except ImportError:
    from xml.etree.ElementTree import XML
 import zipfile


 """
 Module that extract text from MS XML Word document (.docx).
 (Inspired by python-docx <https://github.com/mikemaccana/python-docx>)
 """

 WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
 PARA = WORD_NAMESPACE + 'p'
 TEXT = WORD_NAMESPACE + 't'


 def get_docx_text(path):
    """
    Take the path of a docx file as argument, return the text in unicode.
    """
    document = zipfile.ZipFile(path)
    xml_content = document.read('word/document.xml')
    document.close()
    tree = XML(xml_content)

    paragraphs = []
    for paragraph in tree.getiterator(PARA):
        texts = [node.text
                 for node in paragraph.getiterator(TEXT)
                 if node.text]
        if texts:
            paragraphs.append(''.join(texts))

    return '\n\n'.join(paragraphs)
	try:
	from xml.etree.cElementTree import XML
	except ImportError:
	from xml.etree.ElementTree import XML
	import zipfile


	"""
	Module that extract text from MS XML Word document (.docx).
	(Inspired by python-docx <https://github.com/mikemaccana/python-docx>)
	"""

	WORD_NAMESPACE = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
	PARA = WORD_NAMESPACE + 'p'
	TEXT = WORD_NAMESPACE + 't'


	def get_docx_text(path):
	"""
	Take the path of a docx file as argument, return the text in unicode.
	"""
	document = zipfile.ZipFile(path)
	xml_content = document.read('word/document.xml')
	document.close()
	tree = XML(xml_content)

	paragraphs = []
	for paragraph in tree.getiterator(PARA):
	texts = [node.text
	for node in paragraph.getiterator(TEXT)
	if node.text]
	if texts:
	paragraphs.append(''.join(texts))

	return '\n\n'.join(paragraphs)