Created
June 29, 2025 06:39
-
-
Save riasat-sheikh/07aec09f8687565f0a3e4646275454db to your computer and use it in GitHub Desktop.
arXiv downloader
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# arXiv downloader: query the arXiv API for a keyword and save the PDF of each
# matching entry into a local directory. Files that already exist and can be
# opened by PyPDF2 are skipped; files that fail to open are fetched again.
import os
import time
from urllib.parse import urlencode

import feedparser
import requests
from PyPDF2 import PdfReader
from tqdm import tqdm

# Set search parameters
search_query = "pNGB"  # keyword
max_results = 10000  # change as needed
output_dir = "arxiv_pngb_pdfs"
os.makedirs(output_dir, exist_ok=True)

base_url = "http://export.arxiv.org/api/query?"
block_size = 1024  # bytes per chunk when streaming a PDF to disk

# The arXiv API manual asks clients to request results in slices of at most
# 2000 and to wait about 3 seconds between consecutive calls.
API_PAGE_SIZE = 2000
API_DELAY_SECONDS = 3
# Seconds to wait for the PDF server (connect / between bytes) before giving up.
REQUEST_TIMEOUT = 60


def _fetch_entries(keyword, limit):
    """Collect up to *limit* feed entries matching *keyword*, one page at a time.

    Returns a ``(entries, total_available)`` tuple. ``total_available`` is the
    ``opensearch_totalresults`` value reported by the first page, or ``None``
    when the feed did not include it (for example after a failed fetch).
    """
    entries = []
    total_available = None
    start = 0
    while start < limit:
        if start:
            # Pause between consecutive API calls, never before the first one.
            time.sleep(API_DELAY_SECONDS)
        params = urlencode(
            {
                "search_query": f"all:{keyword}",
                "start": start,
                "max_results": min(API_PAGE_SIZE, limit - start),
            },
            safe=":",  # keep the "all:" field prefix readable in the URL
        )
        feed = feedparser.parse(base_url + params)
        if total_available is None:
            total_available = feed.feed.get("opensearch_totalresults")
        if not feed.entries:
            # Either the results are exhausted or the page failed to load.
            break
        entries.extend(feed.entries)
        start += len(feed.entries)
        try:
            if total_available is not None and start >= int(total_available):
                break
        except ValueError:
            # Unparseable total: fall back to stopping on the first empty page.
            pass
    return entries, total_available


def _is_valid_pdf(path):
    """Return True if PyPDF2 can open the file at *path*, False otherwise."""
    try:
        with open(path, "rb") as f:
            PdfReader(f)
    except Exception:
        # Deliberately broad: any failure to open or parse the file means it
        # should be treated as unusable and fetched again.
        return False
    return True


def _download_pdf(pdf_url, filepath, label):
    """Stream *pdf_url* to *filepath*; return True on success, False on failure.

    The data is written to a sibling ``.part`` file first and moved into place
    only after the transfer completed and the result opens as a PDF, so an
    interrupted or rejected download never leaves a broken file at *filepath*.
    """
    partial_path = filepath + ".part"
    try:
        with requests.get(pdf_url, stream=True, timeout=REQUEST_TIMEOUT) as response:
            # Without this, an HTTP error page would be saved as a ".pdf".
            response.raise_for_status()
            total_size = int(response.headers.get("content-length", 0))
            with open(partial_path, "wb") as f, tqdm(
                total=total_size or None,  # None = unknown length
                unit="B",
                unit_scale=True,
                desc=label,
                leave=False,
            ) as progress_bar:
                for data in response.iter_content(block_size):
                    f.write(data)
                    progress_bar.update(len(data))
        if not _is_valid_pdf(partial_path):
            print(f"Downloaded data is not a readable PDF: {pdf_url}")
            os.remove(partial_path)
            return False
        os.replace(partial_path, filepath)
        return True
    except (requests.RequestException, OSError) as exc:
        print(f"Download failed for {pdf_url}: {exc}")
        if os.path.exists(partial_path):
            os.remove(partial_path)
        return False


# Fetch data from arXiv API
entries, total_available = _fetch_entries(search_query, max_results)
total_label = total_available if total_available is not None else "unknown"
print(f"Total results available from arXiv: {total_label}")

# Download PDFs
failed_ids = []
for entry in tqdm(entries, desc="Downloading PDFs"):
    arxiv_id = entry.id.split("/abs/")[-1]
    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    # Old-style identifiers contain "/", which cannot appear in a file name.
    filename = f"{arxiv_id.replace('/', '_')}.pdf"
    filepath = os.path.join(output_dir, filename)

    if os.path.exists(filepath):
        if _is_valid_pdf(filepath):
            print(f"\nSkipping (already exists and valid): {arxiv_id}")
            continue
        print(f"\nRedownloading corrupted file: {arxiv_id}")

    print(f"\nDownloading: {arxiv_id}")
    print(f"Title : {entry.title}")
    print(f"PDF URL : {pdf_url}")
    # One failed paper must not abort the rest of the batch.
    if not _download_pdf(pdf_url, filepath, filename):
        failed_ids.append(arxiv_id)

if failed_ids:
    print(f"\nFailed to download {len(failed_ids)} file(s): {', '.join(failed_ids)}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment