Skip to content

Instantly share code, notes, and snippets.

@riasat-sheikh
Created June 29, 2025 06:39
Show Gist options
  • Save riasat-sheikh/07aec09f8687565f0a3e4646275454db to your computer and use it in GitHub Desktop.
arXiv downloader
import os
import requests
import feedparser
from tqdm import tqdm
from PyPDF2 import PdfReader
# --- Search parameters -------------------------------------------------
search_query = "pNGB"  # keyword searched across all arXiv metadata fields
max_results = 10000    # change as needed; arXiv caps how many a query returns
output_dir = "arxiv_pngb_pdfs"
os.makedirs(output_dir, exist_ok=True)

# --- Fetch result metadata from the arXiv Atom API ---------------------
# Use the TLS endpoint; the plain-http URL is redirected by arXiv anyway.
# NOTE(review): the query string is assembled by hand — a search term
# containing spaces or special characters would need URL-encoding
# (urllib.parse.quote) before being embedded here; "pNGB" is safe as-is.
base_url = "https://export.arxiv.org/api/query?"
query = f"search_query=all:{search_query}&start=0&max_results={max_results}"
feed = feedparser.parse(base_url + query)
print(f"Total results available from arXiv: {feed.feed.opensearch_totalresults}")
# Download each PDF listed in the feed, skipping files that already exist
# locally and parse cleanly with PyPDF2 (a failed/partial earlier download
# is detected by the parse attempt and fetched again).
for entry in tqdm(feed.entries, desc="Downloading PDFs"):
    # entry.id looks like "http://arxiv.org/abs/<id>vN"; keep only the id part.
    arxiv_id = entry.id.split("/abs/")[-1]
    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    filename = f"{arxiv_id.replace('/', '_')}.pdf"  # old-style ids contain "/"
    filepath = os.path.join(output_dir, filename)

    file_exists = os.path.exists(filepath)
    corrupted = False
    if file_exists:
        try:
            with open(filepath, "rb") as f:
                PdfReader(f)  # integrity check: raises if the PDF is unreadable
        except Exception:
            corrupted = True

    if file_exists and not corrupted:
        print(f"\nSkipping (already exists and valid): {arxiv_id}")
        continue
    elif corrupted:
        print(f"\nRedownloading corrupted file: {arxiv_id}")

    print(f"\nDownloading: {arxiv_id}")
    print(f"Title : {entry.title}")
    print(f"PDF URL : {pdf_url}")

    # Fail fast on HTTP errors (otherwise an arXiv error page would be
    # written to disk as a ".pdf"), bound the wait with a timeout, and
    # keep going with the next entry if this one cannot be fetched.
    try:
        response = requests.get(pdf_url, stream=True, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Failed to download {arxiv_id}: {exc}")
        continue

    total_size = int(response.headers.get("content-length", 0))
    block_size = 1024
    try:
        # Stream to disk in chunks so large PDFs are never held in memory.
        with open(filepath, "wb") as f, tqdm(
            total=total_size, unit="B", unit_scale=True, desc=filename, leave=False
        ) as progress_bar:
            for data in response.iter_content(block_size):
                f.write(data)
                progress_bar.update(len(data))
    except Exception:
        # Remove a partially written file so a later run's integrity check
        # sees it as missing rather than as a candidate "corrupted" PDF.
        if os.path.exists(filepath):
            os.remove(filepath)
        raise
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment