Skip to content

Instantly share code, notes, and snippets.

@yangyushi
Last active May 16, 2020 11:35
Show Gist options
  • Save yangyushi/70d3cb6daf62363cf4b630ca75d4f15e to your computer and use it in GitHub Desktop.
Save yangyushi/70d3cb6daf62363cf4b630ca75d4f15e to your computer and use it in GitHub Desktop.
format the entry key names in your bib file to please your supervisor
#!/usr/bin/env python
import re
import sys
from textwrap import dedent
STOPWORDS = [ # common stop words from package nltk
"i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "you're", "you've", "you'll",
"you'd", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "she's",
"her", "hers", "herself", "it", "it's", "its", "itself", "they", "them", "their", "theirs",
"themselves", "what", "which", "who", "whom", "this", "that", "that'll", "these", "those", "am",
"is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does",
"did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
"of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before",
"after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under",
"again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any",
"both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own",
"same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "don't", "should",
"should've", "now", "d", "ll", "m", "o", "re", "ve", "y", "ain", "aren", "aren't", "couldn",
"couldn't", "didn", "didn't", "doesn", "doesn't", "hadn", "hadn't", "hasn", "hasn't", "haven",
"haven't", "isn", "isn't", "ma", "mightn", "mightn't", "mustn", "mustn't", "needn", "needn't",
"shan", "shan't", "shouldn", "shouldn't", "wasn", "wasn't", "weren", "weren't", "won", "won't",
"wouldn", "wouldn't"
]
def detect_entry(i, content):
"""
Determine if a bib entry starts at position i
"""
end = 0
for idx in range(i, len(content)):
if content[idx] == "\n":
end = idx
break
return re.match(r"@\w+{.+", content[i:end]) is not None
def parse_file(bib_content):
cursor = 0
stack = []
in_entry = False
in_content = False
for i, letter in enumerate(bib_content):
if letter == "@":
if detect_entry(i, bib_content):
cursor = i
in_entry = True
if (letter == '{') and in_entry:
stack.append(1)
in_content = True
if (letter == '}') and in_content:
stack.pop()
if in_content and (not stack):
in_entry = False
in_content = False
yield bib_content[cursor:i+1]
def parse_entry(entry):
entry_pattern = re.compile(
r'@(\w+){([\W\w]+)}', re.S
)
article_type, content = entry_pattern.match(entry).group(1, 2)
return article_type, content
def get_citekey(content):
return content.split(',\n')[0]
def get_author(content):
fetch = re.search('author = {(.+)}', content)
if fetch:
authors = fetch.group(1)
else:
authors = 'anonymous'
first_author = authors.split(',')[0]
result = re.sub(r'[\\{}]', '', first_author) # remove modifier such as \v and \c
result = re.sub(r'\\v', '', first_author) # remove modifier such as \v and \c
result = re.sub(r'\\c', '', first_author) # remove modifier such as \v and \c
result = re.sub('\W+', '', result).lower() # remove non ASCII characters
return result
def get_year(content):
# year = {YEAR}
fetch = re.search('year = {(\d+)}', content)
if fetch:
return fetch.group(1)
# date = {YEAR*}
fetch = re.search('date = {(\d+)\D*.*}', content)
if fetch:
return fetch.group(1)
return ''
def get_journal(content):
if 'journal' not in content:
return 'notpaper'
else:
for keyname in ["journal", "journaltitle"]:
search = re.search(r'%s = {(.+)}' % keyname, content)
if search:
journal = search.group(1)
words = [word.lower() for word in journal.split(' ') if re.match('\w+', word)]
words = [word[0] for word in words if word not in STOPWORDS]
abbr = ''.join(words)
if len(abbr) > 1:
return abbr
else:
return journal.lower()
return ""
def reformat_citekey(content, mode='ay', suffix=''):
year = get_year(content)
author = get_author(content)
if mode == 'ay': # author-year
citekey = author + year
elif mode == 'ayj':
journal = get_journal(content)
citekey = author + year + journal
citekey += suffix
return citekey, re.sub(get_citekey(content), citekey, content)
if __name__ == "__main__":
help_msg = dedent("""
usage: ./reform.py input_bib [output_bib]
(if output_bib not provided, overwriting input_bib)
""")
if len(sys.argv) not in [2, 3]:
print(help_msg)
exit()
elif len(sys.argv) == 2:
if sys.argv[1] in ["-h", "h", "--help", "help"]:
print(help_msg)
exit()
else:
filename = sys.argv[1]
new_filename = sys.argv[1]
else:
filename = sys.argv[1]
new_filename = sys.argv[2]
with open(filename, 'r') as f:
content = f.read()
entries = {}
for i, entry in enumerate(parse_file(content)):
article_type, content = parse_entry(entry)
trial_key = get_author(content) + get_year(content)
if trial_key not in entries.keys():
key, content = reformat_citekey(content, mode='ay')
else:
trial_key = get_author(content) + get_year(content) + get_journal(content)
num, suffix = 0, ''
while trial_key in entries.keys():
num += 1
suffix = str(num)
trial_key += suffix
key, content = reformat_citekey(content, mode='ayj', suffix=suffix)
entries.update({
key: {'article_type': article_type, 'content': content}
})
new_content = ''
for key in entries:
new_content += '@%s{%s}\n\n' % (
entries[key]['article_type'],
entries[key]['content'],
)
print("{} entries processed".format(i+1))
with open(new_filename, 'w') as f:
f.write(new_content)
@yangyushi
Copy link
Author

yangyushi commented Apr 8, 2020

This code convert the reference names inside a .bib file, following

Author family time and year with four digits, i.e. johnson2016
If there are multiple papers for that author (or another with the same name) in et same year, use abbreviations of journals, eg prl, jcp, sm.

It is compatible with the output of following software

  1. Papers (Mac only)
  2. Zotero

How to use

./reform.py input_bib [output_bib]

(if output_bib is not provided, the input_bib will be re-written)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment