Skip to content

Instantly share code, notes, and snippets.

@Ze1598
Last active April 5, 2020 15:42
Show Gist options
  • Save Ze1598/e4d7beb2585ff972bf2e7f1d40cd6a7e to your computer and use it in GitHub Desktop.
Save Ze1598/e4d7beb2585ff972bf2e7f1d40cd6a7e to your computer and use it in GitHub Desktop.
Find cited documents in Word (.docx) files
"""
Citation examples:
("Learning Analytics - Definitions, Processes and Potential", p. 2)
("Wanted: A road map for understanding Integrated Learning Systems")
"""
import docx2txt as docx
import re
# Name of the Word document scanned when no other path is given.
DEFAULT_DOCUMENT = "my_lorem_sample.docx"

# Stylized double quotes (U+201C, U+201D, U+201E, U+201F) mapped to the plain
# ASCII double quote, so a single regex covers every quoting style.
# https://www.w3schools.com/charsets/ref_utf_punctuation.asp
_QUOTE_TABLE = str.maketrans(dict.fromkeys("\u201c\u201d\u201e\u201f", '"'))

# Text between double quotes: https://stackoverflow.com/a/378447/9263761
# A citation title is an opening parenthesis immediately followed by a
# double-quoted string. Anything after the closing quote (page number,
# closing parenthesis) is not part of the match.
_CITATION_PATTERN = re.compile(r'\("[^"]*"')


def normalize_quotes(text):
    """Return *text* with stylized double quotes replaced by plain ones.

    All four stylized variants are translated in a single pass; every
    occurrence is replaced.
    """
    return text.translate(_QUOTE_TABLE)


def find_citations(text):
    """Return all citation titles found in *text*, in document order.

    Quotes are normalized first, so citations written with stylized quotes
    are found too. Each returned item is the matched text itself, e.g.
    ``("Some title"``. Duplicates are kept.
    """
    return _CITATION_PATTERN.findall(normalize_quotes(text))


def remove_duplicates(citations):
    """Return the distinct items of *citations*, keeping first-seen order.

    A plain ``set`` would also drop duplicates, but its iteration order is
    arbitrary, which made the printed list differ from run to run.
    """
    return list(dict.fromkeys(citations))


def main(path=DEFAULT_DOCUMENT):
    """Print every citation found in the document at *path*, then the unique ones.

    The document text is read with ``docx.process`` (the ``docx2txt`` module
    imported at the top of the file).
    """
    # Open the document
    text = docx.process(path)

    citations = find_citations(text)
    print("All citations found:", len(citations))
    for citation in citations:
        print(citation)

    # Two blank lines
    print()
    print()

    # Remove duplicate citations
    unique_citations = remove_duplicates(citations)
    print("Unique citations:", len(unique_citations))
    for citation in unique_citations:
        print(citation)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment