Last active
April 5, 2020 15:54
-
-
Save Ze1598/059d17510672d805baa3b13d0fa97821 to your computer and use it in GitHub Desktop.
Find APA style-cited documents in Word (.docx) files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Citation examples used in the sample text (random authors and titles with random dates) | |
| (Sabbagh, 2009) | |
| (Sabbagh, n.d.) | |
| (Sabbagh, 2010a) | |
| (Sabbagh, 2010b) | |
| (Qianyi Gu & Sumner, 2006) | |
| (Despotovic-Zrakic et al., 2012) | |
| (Anonymous, 2010) | |
| (Anonymous, n.d.) | |
| (“Barcelona to Ban Burqa,” 2010) | |
| """ | |
| import docx2txt as docx | |
| import re | |
# Translation table mapping every stylized double quote to the default
# (ASCII) double quote, so the citation pattern only has to handle one
# quote character. A single `translate()` pass replaces all occurrences.
# https://www.w3schools.com/charsets/ref_utf_punctuation.asp
_QUOTE_TABLE = str.maketrans({'“': '"', '”': '"', '„': '"', '‟': '"'})

# Pattern to find all types of citations: "(<source>, <year>)" where
#   <source> is either free text without quotes/parentheses (authors,
#            "Anonymous", "et al." ...) or a double-quoted title. For a
#            quoted title the comma may sit outside the quotes ("Title", 2010)
#            or, as APA prescribes, inside them ("Title," 2010);
#   <year>   is digits with an optional one-character suffix (2010a),
#            or "n.d." for undated sources.
# "(" is excluded from the free-text source so that a stray, unclosed "("
# earlier in the text cannot be swallowed into the citation.
# Text between double quotes: https://stackoverflow.com/a/378447/9263761
_CITATION_PATTERN = re.compile(
    r'\('
    r'(?:[^"()]*, |"[^")]*", |"[^")]*," )'
    r'(?:\d+\w?|n\.d\.)'
    r'\)'
)


def normalize_quotes(text):
    """Return *text* with stylized double quotes replaced by plain '"'."""
    return text.translate(_QUOTE_TABLE)


def find_citations(text):
    """Return every APA-style citation in *text*, in order of appearance.

    *text* is expected to have gone through `normalize_quotes()` already;
    quoted titles are only recognised with the plain '"' character.
    Duplicates are kept, so the length of the result is the total number of
    citations found.
    """
    return [match.group(0) for match in _CITATION_PATTERN.finditer(text)]


def unique_in_order(items):
    """Return *items* without duplicates, keeping first-seen order.

    A plain `set()` would also drop duplicates, but its iteration order for
    strings changes between interpreter runs; `dict.fromkeys()` keeps the
    output stable and in document order.
    """
    return list(dict.fromkeys(items))


def main(docx_path="lorem_sample.docx"):
    """Print all citations, then the unique citations, found in a .docx file.

    docx_path: path of the Word document to scan.
    """
    # Extract the document's text and normalize its quotes
    text = normalize_quotes(docx.process(docx_path))

    citations = find_citations(text)
    print("All citations found:", len(citations))
    for citation in citations:
        print(citation)

    # Two blank lines
    print()
    print()

    # Remove duplicate citations
    unique_citations = unique_in_order(citations)
    print("Unique citations:", len(unique_citations))
    for citation in unique_citations:
        print(citation)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment