Last active
April 5, 2020 15:42
-
-
Save Ze1598/e4d7beb2585ff972bf2e7f1d40cd6a7e to your computer and use it in GitHub Desktop.
Find cited documents in Word (.docx) files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Find cited documents in a Word (.docx) file.

A citation is recognised as an opening parenthesis immediately followed by a
double-quoted title; only that `("title"` part is captured, so any trailing
page reference is not included in the output.

Citation examples:
("Learning Analytics - Definitions, Processes and Potential", p. 2)
("Wanted: A road map for understanding Integrated Learning Systems")
"""
import re

import docx2txt as docx

# Open the document
text = docx.process("my_lorem_sample.docx")

# Replace stylized double quotes by the default double quote so the regex
# below only has to deal with one kind of quote. `translate` handles all four
# variants (and every occurrence of each) in a single pass.
# https://www.w3schools.com/charsets/ref_utf_punctuation.asp
text = text.translate(str.maketrans('“”„‟', '"' * 4))

# Text between double quotes: https://stackoverflow.com/a/378447/9263761
# Regex pattern to find citation titles: "(" followed by a quoted string
pattern = r'\("[^"]*"'

# The pattern has no capture groups, so `findall` returns the full text of
# every match, in the order the matches appear in the document
citations = re.findall(pattern, text)
print("All citations found:", len(citations))
for citation in citations:
    print(citation)

# Two blank lines
print()
print()

# Remove duplicate citations. `dict.fromkeys` keeps the first occurrence of
# each citation in document order, whereas a `set` would print them in an
# arbitrary order that can change between runs.
unique_citations = list(dict.fromkeys(citations))
print("Unique citations:", len(unique_citations))
for citation in unique_citations:
    print(citation)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment