Skip to content

Instantly share code, notes, and snippets.

@jamespaultg
Created April 14, 2020 18:09
Show Gist options
  • Save jamespaultg/42cd539bd39613a4e73a0be13dfa85f4 to your computer and use it in GitHub Desktop.
Save jamespaultg/42cd539bd39613a4e73a0be13dfa85f4 to your computer and use it in GitHub Desktop.
Read a Word document in Python
!pip3 install python-docx -q
import docx  # the python-docx package installed above is imported under the name "docx"
import docx2txt
# Replace the following line with the location of your .docx file.
wordfile = "your word document.docx"
# get the contents of the word document
def getDocxContent(filename):
    """Return the text of every paragraph in a Word document as one string.

    Args:
        filename: Location of the .docx file; passed unchanged to
            ``docx.Document``.

    Returns:
        The text of ``doc.paragraphs`` in document order, joined with
        newlines so that adjacent paragraphs do not run together.
    """
    doc = docx.Document(filename)
    # A single join keeps paragraph boundaries and avoids growing a string
    # with += inside a loop.
    return "\n".join(para.text for para in doc.paragraphs)
doc_content = getDocxContent(wordfile)

# Extracting paragraph text
doc = docx.Document(wordfile)
# Bare expression: the value is discarded when this runs as a plain script;
# it is only echoed when it is the last line of a notebook cell.
doc.paragraphs[0].text

# Extracting text from different sections
sections = doc.sections
# Bare expression again (number of sections); see the note above.
len(doc.sections)
for section in sections:
    print(section.start_type)
# get headings
def iter_headings(paragraphs):
    """Yield the paragraphs whose style name starts with ``'Heading'``.

    Every paragraph's style name is printed as it is examined, whether or
    not the paragraph is yielded.

    Args:
        paragraphs: Iterable of objects exposing ``style.name`` as a string.

    Yields:
        Each matching paragraph object, in input order.
    """
    for paragraph in paragraphs:
        # Read the style name once; it is used for both the print and the test.
        style_name = paragraph.style.name
        print(style_name)
        if style_name.startswith('Heading'):
            yield paragraph
# The slice skips the first paragraph and stops before index 100, so at most
# paragraphs 1-99 are examined.
for heading in iter_headings(doc.paragraphs[1:100]):
    print(heading.text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment