Skip to content

Instantly share code, notes, and snippets.

@lfoppiano
Created April 4, 2024 08:38
Show Gist options
  • Save lfoppiano/5b2a1ef545ca5d4dea9b8534c4ed54cf to your computer and use it in GitHub Desktop.
Save lfoppiano/5b2a1ef545ca5d4dea9b8534c4ed54cf to your computer and use it in GitHub Desktop.
Look up Open Access PDF files from a list of DOIs using Biblio Glutton (https://github.com/kermitt2/biblio-glutton).
import argparse
import os
from pathlib import Path
import requests
# Constants
# Base URL of the Biblio Glutton instance to query. The value below is a
# placeholder to be replaced before running; main() appends
# "/service/lookup" to it to build the lookup request URL.
GLUTTON_URL = "ADD BIBLIO GLUTTON LOOKUP SERVICE"
def main(input_path: Path, output_path: Path, has_header: bool = False):
    """Look up each DOI of a text file and download its open-access PDF.

    The input file is read line by line, one DOI per line. Each DOI is sent
    to ``GLUTTON_URL + "/service/lookup"``; when the JSON answer carries a
    non-empty ``oaLink``, that link is handed to ``download_pdf`` and stored
    as ``<output_path>/<DOI with "/" replaced by "_">.pdf``.

    Args:
        input_path: Text file with one DOI per line. Lines that are empty
            after trimming surrounding whitespace are skipped.
        output_path: Directory that receives the PDFs; created if missing.
        has_header: When true, the first line of the input file is treated
            as a header and skipped.

    Per-DOI lookup problems (network error, non-200 status, non-JSON body)
    are printed and the loop moves on to the next DOI.
    """
    # Without a timeout, requests waits indefinitely on a stalled server.
    lookup_timeout = 30

    output_dir = Path(output_path)
    output_dir.mkdir(parents=True, exist_ok=True)

    with open(input_path) as f:
        if has_header:
            # Drop the header row; the default keeps an empty file harmless.
            next(f, None)

        for line in f:
            # strip() also removes "\r" and stray spaces, not just "\n".
            doi = line.strip()
            if not doi:
                continue

            payload = {'doi': doi, 'postValidate': False}
            try:
                response = requests.get(GLUTTON_URL + "/service/lookup",
                                        params=payload,
                                        timeout=lookup_timeout)
            except requests.RequestException as e:
                print(f"{doi}: lookup failed: {e}")
                continue

            if response.status_code != 200:
                print(f"{doi}: invalid request")
                print(response.content)
                continue

            try:
                response_data = response.json()
            except ValueError as e:
                # JSON decoding errors raised by requests subclass ValueError.
                print(f"{doi}: lookup returned a non-JSON body: {e}")
                continue

            oa_link = response_data.get('oaLink', '')
            if not oa_link:
                # No open-access link known for this DOI: nothing to fetch.
                continue

            # "/" is a path separator, so it cannot appear in the file name.
            output_filename_path = str(output_dir / (doi.replace("/", "_") + ".pdf"))
            print(f"Download {doi} from {oa_link}")
            download_pdf(oa_link, output_filename_path)
def download_pdf(download_url: str, output_filename: str) -> None:
    """Download the file at ``download_url`` and save it as ``output_filename``.

    The response body is streamed to disk in chunks. A 4xx/5xx HTTP status
    is treated as a failure before the output file is opened, so an error
    page is not stored under the PDF name. Request and file-system errors
    are printed rather than raised, so a failed download does not stop the
    caller.

    Args:
        download_url: URL to fetch.
        output_filename: Path of the file to write (opened in binary mode).
    """
    try:
        # The context manager releases the streamed connection on exit;
        # the timeout keeps a stalled server from blocking forever.
        with requests.get(download_url, stream=True, timeout=60) as response:
            response.raise_for_status()
            with open(output_filename, 'wb') as fd:
                for chunk in response.iter_content(chunk_size=8192):
                    fd.write(chunk)
    except (requests.RequestException, OSError) as e:
        print(f"Error downloading {output_filename}: {e}")
if __name__ == "__main__":
    # Command-line entry point: parse the two required paths and run the batch.
    parser = argparse.ArgumentParser(
        description="Download open access PDF using Biblio-glutton with a list of DOIs")
    parser.add_argument("--input",
                        help="Input text file, one DOI per line.",
                        type=Path,
                        required=True)
    parser.add_argument("--output",
                        help="Output directory",
                        type=Path,
                        required=True)
    args = parser.parse_args()
    # Pass the parsed values directly: binding a module-level name ``input``
    # would shadow the builtin of the same name for the whole module.
    main(args.input, args.output)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment