Last active
July 28, 2021 09:12
-
-
Save kingjr/0dc61b1db2692dc6180c823642221fe8 to your computer and use it in GitHub Desktop.
get_citations.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import urllib.request | |
import subprocess | |
import pandas as pd # pip install pandas | |
import betterbib # pip install betterbib | |
import bibtexparser # pip install bibtexparser | |
from bibtexparser.bparser import BibTexParser | |
def fix_duplicated_entries(lines):
    """Concatenate BibTeX lines, renaming repeated citation keys.

    Every line that opens an entry (``@type{key,``) has its key checked
    against the keys already seen. A repeated key gets ``_dup`` appended
    until it is unique, and the rename is reported on stdout.

    Parameters
    ----------
    lines : iterable of bytes or str
        Lines of a BibTeX file. ``bytes`` lines are decoded as ISO-8859-1;
        ``str`` lines are used as they are.

    Returns
    -------
    str
        All lines joined together, with duplicated keys renamed.
    """
    # Block types that use ``@name{`` syntax but do not declare a citation key.
    non_entries = ('string', 'comment', 'preamble')
    seen = set()
    fixed = []
    for line in lines:
        if isinstance(line, bytes):
            line = line.decode("ISO-8859-1")
        head, brace, tail = line.partition('{')
        kind = head.strip()
        is_entry = (
            bool(brace)
            and kind.startswith('@')
            and kind[1:].isalpha()
            and kind[1:].lower() not in non_entries
        )
        if is_entry:
            raw_key, comma, rest = tail.partition(',')
            orig = raw_key.strip()
            if orig:
                entry = orig
                while entry in seen:
                    entry += '_dup'
                seen.add(entry)
                if entry != orig:
                    print(f'Duplicate entry: {orig} -> {entry}')
                    # Rebuild the line around the key instead of using a
                    # global str.replace, which would also rewrite the key
                    # text wherever else it occurs (e.g. in the entry type).
                    line = (head + brace
                            + raw_key.replace(orig, entry, 1)
                            + comma + rest)
        fixed.append(line)
    return ''.join(fixed)
def download_bib(fname='citations.bib', url=None):
    """Download a BibTeX export and write it to ``fname``.

    The downloaded lines are passed through ``fix_duplicated_entries``
    before being written.

    Parameters
    ----------
    fname : str
        Path of the BibTeX file to write.
    url : str | None
        Address of the BibTeX export. When ``None``, a hard-coded Google
        Scholar export URL is used and a notice is printed.
    """
    if url is None:
        print('URL not specified. Default to downloading Jean-Remi King')
        url = 'https://scholar.googleusercontent.com/citations?view_op=export_citations&user='
        url += 'XZOgIwEAAAAJ&citsig=AMD79ooAAAAAYQJVOIWP0ghV_gx88PW2T3FK25bf0z9K&hl=en'
    # Use the response as a context manager so the connection is closed
    # once every line has been consumed.
    with urllib.request.urlopen(url) as response:
        data = fix_duplicated_entries(response)
    # The file is written as ASCII; characters that cannot be encoded are
    # dropped (errors='ignore').
    with open(fname, 'w', encoding="ascii", errors='ignore') as f:
        f.write(data)
def enrich_bib(src, dest=None):
    """Run the ``betterbib`` command-line tool on ``src``.

    Parameters
    ----------
    src : str
        Path of an existing BibTeX file.
    dest : str | None
        Path passed to ``betterbib`` as the output file. Defaults to
        ``src``.

    Raises
    ------
    AssertionError
        If ``src`` is not an existing file.
    """
    print(f'Enrich {src}...')
    if dest is None:
        dest = src
    assert os.path.isfile(src), f'{src} is not a file'
    # Pass the arguments as a list rather than splitting a formatted string,
    # so paths containing whitespace stay intact.
    result = subprocess.run(
        ['betterbib', src, dest],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # stderr has to be piped explicitly; otherwise nothing is captured and
    # error output can never be reported from here.
    error = result.stderr.decode('utf-8', errors='replace').strip()
    if error:
        print(error)
def bib_to_csv(src, dest=None):
    """Parse a BibTeX file into a DataFrame and save it as CSV.

    Parameters
    ----------
    src : str
        Path of the BibTeX file to read.
    dest : str | None
        Path of the CSV file to write. Defaults to ``src`` with its file
        extension replaced by ``.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per parsed entry, built from ``bib.entries``.
    """
    if dest is None:
        # Swap only the extension. A plain str.replace('.bib', '.csv') would
        # also touch '.bib' elsewhere in the path, and would leave dest equal
        # to src (overwriting the input) when src has no '.bib' in it.
        dest = os.path.splitext(src)[0] + '.csv'
    parser = BibTexParser(common_strings=True)
    with open(src) as f:
        bib = bibtexparser.load(f, parser=parser)
    bib = pd.DataFrame(bib.entries)
    bib.to_csv(dest)
    return bib
def main():
    """Download, enrich and tabulate the citations.

    Writes ``citations.bib``, then ``citations_rich.bib`` and
    ``citations_rich.csv``, and returns the value produced by
    ``bib_to_csv``.
    """
    raw_bib = 'citations.bib'
    rich_bib = 'citations_rich.bib'
    download_bib(raw_bib)
    enrich_bib(raw_bib, rich_bib)
    return bib_to_csv(rich_bib, 'citations_rich.csv')
def _lsp_text(pub, key):
    """Return field ``key`` of row ``pub`` as text, or '' if missing/NaN."""
    value = pub.get(key)
    if value is None or pd.isna(value):
        return ''
    return str(value)


def _lsp_authors(raw):
    """Turn a BibTeX author field into 'Last, F.' items joined by ', '."""
    # LaTeX accents are written as raw strings so each backslash is literal
    # and no invalid escape sequence is involved.
    replacements = (
        (' and ', '; '),
        ('JR', 'Jean-Rémi'),
        ('Remi', 'Rémi'),
        (r"{\'e}", 'é'),
        (r'{\"u}', 'ü'),
        (r'{\"e}', 'ë'),
        (r'{\c{c}}', 'ç'),
        (r'{\^\i}', 'î'),
        (r'{\`e}', 'è'),
    )
    for old, new in replacements:
        raw = raw.replace(old, new)
    formatted = []
    for author in raw.split('; '):
        if author == '':
            continue
        if author == 'others':
            formatted.append('et al.')
            continue
        last, sep, first = author.partition(', ')
        # One initial per hyphen-separated part: 'Jean-Rémi' -> 'J-R'.
        initials = '-'.join(part[0] for part in first.split('-') if part)
        if sep and initials:
            formatted.append(last + ', ' + initials + '.')
        else:
            # Not in "Last, First" form: keep the name verbatim.
            formatted.append(author)
    return ', '.join(formatted)


def _lsp_pages(value):
    """Return a page range as 'pp.<range>', or None if it has no digit."""
    value = value.replace('--', '-')
    value = ''.join(c for c in value if not c.isalpha())
    first_digit = next((i for i, c in enumerate(value) if c.isdigit()), None)
    if first_digit is None:
        return None
    return 'pp.' + value[first_digit:]


def lsp_format(bib, year=2021):
    """Print the publications from ``year`` onwards as a bulleted list.

    Each selected row is printed as
    ``• authors (year). title., venue, volume, pp.pages, doi:doi``
    followed by an empty line. Empty or missing fields are skipped, and the
    venue is wrapped in ANSI bold escape codes. The venue is the ``journal``
    field, or ``organization`` when ``journal`` is missing. A journal of the
    form ``'<name> preprint <id>'`` containing ``'arXiv'`` is split:
    ``<name>`` is used as venue and ``<id>`` as DOI.

    Parameters
    ----------
    bib : pandas.DataFrame
        One row per publication. Must have a ``year`` column; ``author``,
        ``title``, ``journal``, ``organization``, ``volume``, ``pages`` and
        ``doi`` are used when present. The frame is not modified.
    year : int
        Earliest publication year to print. Years that cannot be parsed as
        numbers are treated as 0.

    Returns
    -------
    None
    """
    bold_on, bold_off = '\033[1m', '\033[0m'
    # Parse the years into a separate Series so that the caller's frame is
    # never written to.
    years = pd.to_numeric(bib['year'], errors='coerce').fillna(0).astype(int)
    recent = years >= float(year)
    for (_, pub), pub_year in zip(bib[recent].iterrows(), years[recent]):
        authors = _lsp_authors(_lsp_text(pub, 'author'))
        title = _lsp_text(pub, 'title').replace('{', '').replace('}', '')
        line = f'• {authors} ({pub_year}). {title}.'

        journal = pub.get('journal')
        has_journal = journal is not None and not pd.isna(journal)
        venue = _lsp_text(pub, 'journal' if has_journal else 'organization')
        doi = _lsp_text(pub, 'doi')
        if has_journal and 'arXiv' in venue and ' preprint ' in venue:
            pieces = venue.split(' preprint ')
            venue, doi = pieces[0], pieces[1]

        if venue:
            line += ', ' + bold_on + venue + bold_off
        volume = _lsp_text(pub, 'volume')
        if volume:
            line += ', ' + volume
        pages = _lsp_text(pub, 'pages')
        if pages:
            pages = _lsp_pages(pages)
            if pages is not None:
                line += ', ' + pages
        if doi:
            line += ', ' + 'doi:' + doi
        print(line)
        print('')
# Script entry point: build the citation table, then print the recent
# publications as a formatted list.
if __name__ == '__main__':
    citations = main()
    lsp_format(citations)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment