Created
April 4, 2024 08:38
-
-
Save lfoppiano/5b2a1ef545ca5d4dea9b8534c4ed54cf to your computer and use it in GitHub Desktop.
Look up and download Open Access PDF files for a list of DOIs using Biblio-Glutton (https://github.com/kermitt2/biblio-glutton)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse | |
import os | |
from pathlib import Path | |
import requests | |
# Constants
# Base URL of the Biblio-Glutton instance; "/service/lookup" is appended to it
# when querying. It can be supplied through the GLUTTON_URL environment
# variable; otherwise the placeholder below must be replaced by hand before
# running the script.
GLUTTON_URL = os.environ.get("GLUTTON_URL", "ADD BIBLIO GLUTTON LOOKUP SERVICE")
def main(input_path: Path, output_path: Path, has_header: bool = False):
    """Look up every DOI listed in a file and download its open-access PDF.

    Each non-empty line of ``input_path`` is treated as one DOI. The DOI is
    sent to the ``/service/lookup`` endpoint under ``GLUTTON_URL``; when the
    JSON answer carries a non-empty ``oaLink``, that link is downloaded to
    ``<output_path>/<doi with "/" replaced by "_">.pdf``.

    Args:
        input_path: Text file with one DOI per line.
        output_path: Directory receiving the PDFs; created if missing.
        has_header: When true, the first line of the input is skipped.
    """
    request_timeout = 60  # seconds; keeps one stalled lookup from hanging the batch

    # Tolerate a base URL configured with or without a trailing slash.
    lookup_url = GLUTTON_URL.rstrip("/") + "/service/lookup"

    # open(..., 'wb') in download_pdf fails if the target directory is absent.
    os.makedirs(output_path, exist_ok=True)

    with open(input_path, encoding="utf-8") as f:
        for line_number, line in enumerate(f):
            if has_header and line_number == 0:
                continue

            # strip() also removes "\r" and stray spaces that would corrupt
            # both the query and the output file name.
            doi = line.strip()
            if not doi:
                continue

            payload = {'doi': doi, 'postValidate': False}
            try:
                response = requests.get(lookup_url, params=payload, timeout=request_timeout)
            except requests.RequestException as e:
                # A single unreachable lookup must not abort the remaining DOIs.
                print(f"{doi}: lookup failed: {e}")
                continue

            if response.status_code != 200:
                print(f"{doi}: invalid request")
                print(response.content)
                continue

            try:
                response_data = response.json()
            except ValueError as e:
                print(f"{doi}: lookup returned invalid JSON: {e}")
                continue

            oa_link = response_data.get('oaLink', '')
            if not oa_link:
                # No open-access location known for this DOI: nothing to fetch.
                continue

            output_filename_path = os.path.join(output_path, doi.replace("/", "_")) + ".pdf"
            print(f"Download {doi} from {oa_link}")
            download_pdf(oa_link, output_filename_path)
def download_pdf(download_url: str, output_filename: str) -> None:
    """Download the file at ``download_url`` and save it as ``output_filename``.

    The download is best-effort: HTTP, network and file-system errors are
    reported on stdout and the function returns normally, so a caller looping
    over many URLs is not interrupted.

    Args:
        download_url: URL of the PDF to fetch.
        output_filename: Path of the file to write (opened in binary mode).
    """
    try:
        # The context manager releases the streamed connection even on error.
        with requests.get(download_url, stream=True, timeout=60) as response:
            # Without this check a 4xx/5xx error page would be stored as a ".pdf".
            response.raise_for_status()
            with open(output_filename, 'wb') as fd:
                for chunk in response.iter_content(chunk_size=8192):
                    fd.write(chunk)
    except (requests.RequestException, OSError) as e:
        print(f"Error downloading {output_filename}: {str(e)}")
if __name__ == "__main__":
    # Command-line entry point: parse the two required paths and run the lookup.
    parser = argparse.ArgumentParser(
        description="Download open access PDF using Biblio-glutton with a list of DOIs")

    parser.add_argument("--input",
                        type=Path,
                        help="Input text file, one DOI per line.",
                        required=True)
    parser.add_argument("--output",
                        type=Path,
                        help="Output directory",
                        required=True)

    args = parser.parse_args()

    # argparse converts both values to Path (type=Path), which avoids the
    # intermediate variables that used to shadow the builtin `input`.
    main(args.input, args.output)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment