Created
August 17, 2016 15:40
-
-
Save EoinTravers/d9eaee255a9702a75edee6e77ebabd77 to your computer and use it in GitHub Desktop.
Find and print references from an APA-formatted paper.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re

# A four-digit year directly after an opening bracket, e.g. "(2016)".
BRACKETED_YEAR = re.compile(r'\([0-9]{4}')
# Any run of four digits, e.g. the "2016)" in "(Smith & Jones, 2016)".
BARE_YEAR = re.compile(r'[0-9]{4}')
# Shortest leading run of text up to and including the first "(".
LEADING_TO_BRACKET = re.compile(r'.+?\(')


def find_references(text, go_back_by=5):
    """Return the citation-like snippets found in *text*.

    The text is split on single spaces and scanned word by word. Whenever a
    word contains a four-digit number, that word and up to `go_back_by`
    preceding words are joined into one snippet; the window never reaches
    back past the previous snippet.

    Args:
        text: Full text of the paper.
        go_back_by: Maximum number of words to keep before the year.

    Returns:
        A list of snippet strings in order of appearance, with 'see ' and
        'e.g.' removed.
    """
    words = text.split(' ')
    refs = []
    last = 0  # Index just past the most recent match; windows stop here.
    for i, word in enumerate(words):
        if BRACKETED_YEAR.search(word):
            # Year in brackets, e.g. "Smith and Jones (2016)".
            # Go back by `go_back_by` words, or to the last ref found.
            back_to = max(last, i - go_back_by)
            ref = ' '.join(words[back_to:i + 1])
        elif BARE_YEAR.search(word):
            # Year not directly after a bracket, e.g. "(Smith & Jones, 2016)".
            back_to = max(last, i - go_back_by)
            ref = ' '.join(words[back_to:i + 1])
            # Strip everything leading up to the opening bracket, then drop
            # any closing brackets.
            ref = LEADING_TO_BRACKET.sub('', ref, count=1).replace(')', '')
        else:
            continue
        refs.append(ref)
        last = i + 1
    # You can strip out common cruft here.
    return [ref.replace('see ', '').replace('e.g.', '') for ref in refs]


def main(path='Paper.txt'):
    """Read the paper at *path* and print each snippet on its own line."""
    with open(path, 'r') as f:
        paper = f.read()
    for ref in find_references(paper):
        print(ref)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment