Last active
December 13, 2023 06:17
-
-
Save lucainnocenti/bbfee069c5bc87c08a308753513acb96 to your computer and use it in GitHub Desktop.
Python script that sorts the bibitem entries according to the order in which they are cited in the text. Useful when, for whatever reason, one cannot use BibTeX and .bib files. Usage: `python bibitem-sorter.py file.tex sorted_file.tex`
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import os | |
import sys | |
import re | |
def printBold(msg):
    """Print *msg* to stdout in bold magenta using ANSI escape codes.

    Args:
        msg: Text to print. It is concatenated with the escape sequences,
            so it must be a ``str``.
    """
    HEADER = '\033[95m'  # bright-magenta foreground
    BOLD = '\033[1m'
    RESET = '\033[0m'  # reset all attributes so later output is unaffected
    print(HEADER + BOLD + msg + RESET)
def _collect_cite_keys(text):
    """Return the keys cited in *text*, de-duplicated, in order of first use.

    Recognises ``\\cite{a}``, ``\\cite{a, b}`` and ``\\cite[note]{a}``.
    Whitespace around every key is stripped and empty keys (for example
    from a trailing comma) are ignored.
    """
    cite_re = re.compile(r'\\cite(?:\[[^\]]*\])*\{([^}]+)\}')
    ordered = []
    seen = set()
    for group in cite_re.findall(text):
        for key in (part.strip() for part in group.split(',')):
            if key and key not in seen:
                seen.add(key)
                ordered.append(key)
    return ordered


def _split_bibitems(body):
    """Split a thebibliography body into ``(preamble, entries)``.

    ``preamble`` is whatever precedes the first ``\\bibitem`` (stripped).
    ``entries`` is a list of ``(key, raw_text)`` pairs where ``raw_text``
    runs from the ``\\bibitem`` command up to the next ``\\bibitem`` (or the
    end of the body) with trailing whitespace removed. Splitting on the
    command itself, not on blank lines, keeps entries intact regardless
    of how they are spaced in the source.
    """
    bibitem_re = re.compile(r'\\bibitem(?:\[[^\]]*\])?\{([^}]+)\}')
    headers = list(bibitem_re.finditer(body))
    if not headers:
        return body.strip(), []
    preamble = body[:headers[0].start()].strip()
    # Each entry ends where the next one starts; the last ends with the body.
    bounds = [header.start() for header in headers[1:]] + [len(body)]
    entries = [
        (header.group(1).strip(), body[header.start():stop].rstrip())
        for header, stop in zip(headers, bounds)
    ]
    return preamble, entries


def main():
    """Sort the ``\\bibitem`` entries of a LaTeX file by citation order.

    Command-line arguments:
        path: LaTeX file containing a ``thebibliography`` environment.
        output_path: Where the file with the reordered bibliography is
            written.

    Entries are ordered by the first ``\\cite`` of their key in the text.
    Entries that are never cited are reported with a warning and kept, in
    their original relative order, after the cited ones. Everything
    outside the bibliography body is copied through unchanged.

    Raises:
        ValueError: If the file has no ``thebibliography`` environment, or
            if a cited key has no matching ``\\bibitem``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('path', type=str)
    parser.add_argument('output_path', type=str)
    args = parser.parse_args()

    with open(args.path, 'r') as fp:
        text = fp.read()

    # Greedy body: spans from the first \begin line to the last \end.
    bibliography = re.search(
        r'\\begin\{thebibliography\}.*\n'
        r'(?P<body>[\s\S]*)'
        r'\\end\{thebibliography\}',
        text)
    if bibliography is None:
        raise ValueError(
            'No thebibliography environment found in {}.'.format(args.path))

    cites = _collect_cite_keys(text)
    preamble, entries = _split_bibitems(bibliography.group('body'))

    # Every cited key must have an entry.
    known_keys = {key for key, _ in entries}
    for cite in cites:
        if cite not in known_keys:
            raise ValueError('Citing missing reference: {}.'.format(cite))

    # Partition entries into cited and uncited; uncited ones are only warned
    # about, never dropped.
    rank = {key: position for position, key in enumerate(cites)}
    cited = []
    uncited = []
    for entry in entries:
        if entry[0] in rank:
            cited.append(entry)
        else:
            printBold('WARNING: Uncited reference: {}.'.format(entry[0]))
            uncited.append(entry)

    if [key for key, _ in cited] == cites:
        printBold('References are sorted')
    else:
        printBold('References are not sorted, proceeding to sort them.')

    # list.sort is stable, so duplicate keys keep their relative order.
    cited.sort(key=lambda entry: rank[entry[0]])

    chunks = [preamble] if preamble else []
    chunks.extend(raw for _, raw in cited + uncited)
    new_body = '\n\n' + ''.join(chunk + '\n\n' for chunk in chunks) + '\n'

    # Splice by slicing instead of re.sub so that backslashes in the entries
    # (e.g. a LaTeX line break "\\") are never interpreted as regex
    # replacement escapes and are written back exactly as read.
    start, stop = bibliography.span('body')
    output_text = text[:start] + new_body + text[stop:]

    printBold('Done. Saving results to {}.'.format(args.output_path))
    with open(args.output_path, 'w') as fp:
        fp.write(output_text)
# Run the sorter only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment