riasat-sheikh · June 29, 2025 06:39
diff --git a/arXiv_download.py b/arXiv_download.py
 import os
 import requests
 import feedparser
 from tqdm import tqdm
 from PyPDF2 import PdfReader

 # Set search parameters
 search_query = "pNGB"  # keyword
 max_results = 10000  # change as needed
 # NOTE(review): the arXiv API manual describes a cap on how many results a
 # single call returns; confirm 10000 is honored or page through with `start`.
 output_dir = "arxiv_pngb_pdfs"
 request_timeout = 30  # seconds; requests waits indefinitely when no timeout is given
 block_size = 8192  # bytes per streamed chunk (loop-invariant, so defined once)
 os.makedirs(output_dir, exist_ok=True)

 # Fetch data from arXiv API
 base_url = "http://export.arxiv.org/api/query?"
 query = f"search_query=all:{search_query}&start=0&max_results={max_results}"
 feed = feedparser.parse(base_url + query)
 # feedparser flags fetch/parse problems via `bozo` instead of raising, so a
 # failed request would otherwise surface as an AttributeError just below.
 if feed.get("bozo"):
    print(f"Warning: problem reading arXiv feed: {feed.get('bozo_exception')}")
 total_available = feed.feed.get("opensearch_totalresults", "unknown")
 print(f"Total results available from arXiv: {total_available}")

 # Download PDFs
 for entry in tqdm(feed.entries, desc="Downloading PDFs"):
    arxiv_id = entry.id.split("/abs/")[-1]
    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
    # Old-style identifiers contain "/" (e.g. "hep-ph/0001234"); flatten them
    # so every PDF lands directly inside output_dir.
    filename = f"{arxiv_id.replace('/', '_')}.pdf"
    filepath = os.path.join(output_dir, filename)

    # An existing file is only trusted if PyPDF2 can open it.
    file_exists = os.path.exists(filepath)
    corrupted = False
    if file_exists:
        try:
            with open(filepath, "rb") as f:
                PdfReader(f)
        except Exception:
            # Any parse failure is treated as "needs re-download".
            corrupted = True

    if file_exists and not corrupted:
        print(f"\nSkipping (already exists and valid): {arxiv_id}")
        continue
    elif corrupted:
        print(f"\nRedownloading corrupted file: {arxiv_id}")

    print(f"\nDownloading: {arxiv_id}")
    print(f"Title     : {entry.title}")
    print(f"PDF URL   : {pdf_url}")

    # Stream into a temporary ".part" file and rename only after the transfer
    # completes, so an interrupted run never leaves a truncated file under
    # the final name.
    partial_path = filepath + ".part"
    try:
        with requests.get(pdf_url, stream=True, timeout=request_timeout) as response:
            # Without this, an HTTP error page would be saved as a ".pdf".
            response.raise_for_status()
            total_size = int(response.headers.get("content-length", 0))
            with open(partial_path, "wb") as f, tqdm(
                total=total_size, unit="B", unit_scale=True, desc=filename, leave=False
            ) as progress_bar:
                for data in response.iter_content(block_size):
                    f.write(data)
                    progress_bar.update(len(data))
        os.replace(partial_path, filepath)
    except (requests.RequestException, OSError) as exc:
        # One failed paper should not abort the whole batch.
        print(f"\nFailed to download {arxiv_id}: {exc}")
        if os.path.exists(partial_path):
            os.remove(partial_path)
	import os
	import requests
	import feedparser
	from tqdm import tqdm
	from PyPDF2 import PdfReader

	# Set search parameters
	search_query = "pNGB" # keyword
	max_results = 10000 # change as needed
	# NOTE(review): the arXiv API manual describes a cap on how many results a
	# single call returns; confirm 10000 is honored or page through with `start`.
	output_dir = "arxiv_pngb_pdfs"
	request_timeout = 30  # seconds; requests waits indefinitely when no timeout is given
	block_size = 8192  # bytes per streamed chunk (loop-invariant, so defined once)
	os.makedirs(output_dir, exist_ok=True)

	# Fetch data from arXiv API
	base_url = "http://export.arxiv.org/api/query?"
	query = f"search_query=all:{search_query}&start=0&max_results={max_results}"
	feed = feedparser.parse(base_url + query)
	# feedparser flags fetch/parse problems via `bozo` instead of raising, so a
	# failed request would otherwise surface as an AttributeError just below.
	if feed.get("bozo"):
	    print(f"Warning: problem reading arXiv feed: {feed.get('bozo_exception')}")
	total_available = feed.feed.get("opensearch_totalresults", "unknown")
	print(f"Total results available from arXiv: {total_available}")

	# Download PDFs
	for entry in tqdm(feed.entries, desc="Downloading PDFs"):
	    arxiv_id = entry.id.split("/abs/")[-1]
	    pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
	    # Old-style identifiers contain "/" (e.g. "hep-ph/0001234"); flatten them
	    # so every PDF lands directly inside output_dir.
	    filename = f"{arxiv_id.replace('/', '_')}.pdf"
	    filepath = os.path.join(output_dir, filename)

	    # An existing file is only trusted if PyPDF2 can open it.
	    file_exists = os.path.exists(filepath)
	    corrupted = False
	    if file_exists:
	        try:
	            with open(filepath, "rb") as f:
	                PdfReader(f)
	        except Exception:
	            # Any parse failure is treated as "needs re-download".
	            corrupted = True

	    if file_exists and not corrupted:
	        print(f"\nSkipping (already exists and valid): {arxiv_id}")
	        continue
	    elif corrupted:
	        print(f"\nRedownloading corrupted file: {arxiv_id}")

	    print(f"\nDownloading: {arxiv_id}")
	    print(f"Title : {entry.title}")
	    print(f"PDF URL : {pdf_url}")

	    # Stream into a temporary ".part" file and rename only after the transfer
	    # completes, so an interrupted run never leaves a truncated file under
	    # the final name.
	    partial_path = filepath + ".part"
	    try:
	        with requests.get(pdf_url, stream=True, timeout=request_timeout) as response:
	            # Without this, an HTTP error page would be saved as a ".pdf".
	            response.raise_for_status()
	            total_size = int(response.headers.get("content-length", 0))
	            with open(partial_path, "wb") as f, tqdm(
	                total=total_size, unit="B", unit_scale=True, desc=filename, leave=False
	            ) as progress_bar:
	                for data in response.iter_content(block_size):
	                    f.write(data)
	                    progress_bar.update(len(data))
	        os.replace(partial_path, filepath)
	    except (requests.RequestException, OSError) as exc:
	        # One failed paper should not abort the whole batch.
	        print(f"\nFailed to download {arxiv_id}: {exc}")
	        if os.path.exists(partial_path):
	            os.remove(partial_path)