Created
April 17, 2021 00:17
-
-
Save bnewbold/f55d616d891981c83835eb763b1419ed to your computer and use it in GitHub Desktop.
RSCVD Fatcat Fuzzycat Lookup
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
""" | |
To run this script you need the 'fuzzycat' and 'elasticsearch' pip packages | |
installed (eg, 'pip install fuzzycat') | |
""" | |
import sys | |
import csv | |
import json | |
import elasticsearch | |
from fuzzycat.simple import closest_fuzzy_biblio_match | |
from fuzzycat.matching import public_api | |
def first_ia_access(release):
    """
    Return the first Internet Archive URL attached to a release, or None.

    File URLs are checked first, in order; a file URL matches if it contains
    "://web.archive.org/" or "://archive.org/". If no file URL matches, the
    archive URLs of each webcapture are checked, and only
    "://web.archive.org/" URLs match there.

    `release` only needs `.files` and `.webcaptures` attributes (each a list
    or None) whose items expose `.urls` / `.archive_urls` lists of objects
    with a `.url` string.
    """
    for f in (release.files or []):
        for u in (f.urls or []):
            if "://web.archive.org/" in u.url or "://archive.org/" in u.url:
                return u.url
    for w in (release.webcaptures or []):
        # Guard against a missing list the same way as the other fields, so
        # a webcapture without archive URLs is skipped instead of raising.
        for u in (w.archive_urls or []):
            if "://web.archive.org/" in u.url:
                return u.url
    return None
def run(tsv_input):
    """
    Fuzzy-match every row of an RSCVD TSV against fatcat and print the
    annotated row to stdout as one JSON object per line.

    `tsv_input` is an iterable of text lines (eg, an open file) in
    tab-separated format with a header row. The columns read are 'Article',
    'Journal', 'Author Name', 'Publication Year', 'Volume / Edition',
    'Issue', 'Pages', 'DOI', 'pmid' and 'pmcid'; empty or missing cells are
    passed to the matcher as None.

    Keys added to each row:

      fuzzycat_status: name of the match status, or "NONE" if no match
      fatcat_url: fatcat.wiki release URL (only when a release matched)
      ia_access_url: result of first_ia_access() for the matched release
      ia_any_sim: whether the release's container 'extra' metadata has a
          truthy ia.sim entry
    """
    api = public_api("https://api.fatcat.wiki/v0")
    es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki:443")

    reader = csv.DictReader(tsv_input, delimiter="\t")
    for row in reader:
        doi = row.get('DOI') or None
        # Discard values in the DOI column that don't start with "10."
        if doi and not doi.startswith('10.'):
            doi = None
        match = closest_fuzzy_biblio_match(
            es_client=es_client,
            biblio=dict(
                title=row.get('Article') or None,
                journal=row.get('Journal') or None,
                first_author=row.get('Author Name') or None,
                year=row.get('Publication Year') or None,
                volume=row.get('Volume / Edition') or None,
                issue=row.get('Issue') or None,
                pages=row.get('Pages') or None,
                doi=doi,
                pmid=row.get('pmid') or None,
                # This key was previously misspelled "pcmid", so the PMC
                # identifier was never passed along under the right name.
                pmcid=row.get('pmcid') or None,
            ),
        )
        if match:
            row['fuzzycat_status'] = match.status.name
            if match.release:
                # Re-fetch the release with the sub-entities needed below
                match.release = api.get_release(match.release.ident, expand="container,files,webcaptures")
                row['fatcat_url'] = f"https://fatcat.wiki/release/{match.release.ident}"
                row['ia_access_url'] = first_ia_access(match.release)
                if match.release.container and match.release.container.extra:
                    row['ia_any_sim'] = bool((match.release.container.extra.get('ia') or {}).get('sim'))
                else:
                    row['ia_any_sim'] = False
        else:
            row['fuzzycat_status'] = "NONE"
        print(json.dumps(row))
if __name__ == "__main__":
    # Command-line entry point: takes the path of the TSV file as its only
    # argument and writes JSON lines to stdout.
    if len(sys.argv) != 2:
        # Usage errors go to stderr so they never mix with the JSON output
        print("Expect a single argument: RSCVD TSV file (exported from google sheets)", file=sys.stderr)
        sys.exit(-1)
    # newline="" lets the csv module handle line endings itself (needed for
    # quoted cells containing newlines); the encoding is given explicitly
    # instead of relying on the platform default. NOTE(review): assumes the
    # Google Sheets export is UTF-8 -- confirm.
    with open(sys.argv[1], "r", newline="", encoding="utf-8") as tsv_input:
        run(tsv_input)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment