RBI Web Scraping
import os
import time

import requests
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
from tenacity import retry, stop_after_attempt, wait_exponential

BASE_URL = "https://rbi.org.in/Scripts/BS_ViewBulletin.aspx"
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
TARGET_FILE = "Money Stock Measures"


# Retry mechanism for downloads
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def download_file(url, file_name):
    try:
        response = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=10)
        response.raise_for_status()
        with open(file_name, "wb") as file:
            file.write(response.content)
        print(f" ✅ Downloaded: {file_name}")
    except Exception as e:
        print(f" ❌ Failed to download {file_name}: {e}")
        raise e


def safe_click(page, selector, retries=3):
    """Safely click an element with retries."""
    for attempt in range(1, retries + 1):
        try:
            page.locator(selector).click(timeout=5000)
            return
        except PlaywrightTimeoutError as e:
            print(f"Attempt {attempt}/{retries} failed to click {selector}: {e}")
            time.sleep(2)
    print(f"⚠️ Skipping {selector} after {retries} attempts")


def scrape_rbi_bulletin():
    # Ensure data/ directory exists
    os.makedirs("data", exist_ok=True)

    with sync_playwright() as p:
        # Launch browser with retries
        for attempt in range(3):
            try:
                browser = p.chromium.launch(headless=True)
                context = browser.new_context(user_agent=USER_AGENT)
                page = context.new_page()
                page.goto(BASE_URL, timeout=10000)
                break
            except Exception as e:
                print(f"🔴 Failed to load page (attempt {attempt + 1}/3): {e}")
                time.sleep(3)
        else:
            # for/else: runs only if the loop never hit `break`, i.e. every attempt failed
            print("🔴 Critical: Unable to load the RBI bulletin page after 3 attempts.")
            return

        # Wait for year buttons to load
        try:
            page.wait_for_selector(".accordionButton.year", timeout=7000)
        except PlaywrightTimeoutError:
            print("🔴 Failed to load year buttons.")
            browser.close()
            return

        year_buttons = page.locator(".accordionButton.year")
        years = year_buttons.all_inner_texts()

        for year in years:
            if year == "Archives":
                continue
            try:
                print(f"📌 Selecting Year: {year}")
                # Expand the "Archives" accordion first, then the target year
                safe_click(page, ".accordionButton.year:has-text('Archives')")
                safe_click(page, f".accordionButton.year:has-text('{year}')")

                # Wait for month buttons to load
                page.wait_for_selector(
                    f".accordionContent.month[id='{year}']", timeout=5000
                )
                month_buttons = page.locator(f".accordionContent.month[id='{year}'] a")
                months = month_buttons.all_inner_texts()
            except Exception as e:
                print(f"⚠️ Could not fetch months for year {year}: {e}")
                continue

            for month in months:
                if month == "All Months":
                    continue

                print(f"📌 Selecting Month: {month}")
                safe_click(page, f".accordionButton.year:has-text('{year}')")
                try:
                    safe_click(
                        page,
                        f".accordionContent.month[id='{year}'] a:has-text('{month}')",
                    )
                    # Give the page a moment to render the month's bulletin table
                    page.wait_for_timeout(2000)
                except Exception as e:
                    print(f"⚠️ Could not click month {month} for year {year}: {e}")
                    continue

                try:
                    # Locate the row containing "Money Stock Measures"
                    row = page.locator(f'table.tablebg tr:has-text("{TARGET_FILE}")')
                    if row.count() == 0:
                        print(f" ⚠️ No {TARGET_FILE} found for {year} {month}")
                        continue

                    file_url = row.locator("td:nth-child(2) a").get_attribute("href")

                    # Construct file name
                    file_name = os.path.join("data", f"{year}_{month}.xlsx")
                    print(f" 💾 Downloading: {file_name} from {file_url}")
                    download_file(file_url, file_name)
                except Exception as e:
                    print(
                        f" ❌ Could not find/download {TARGET_FILE} for {year} {month}: {e}"
                    )

        # Close the browser
        browser.close()


if __name__ == "__main__":
    scrape_rbi_bulletin()
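Usage note: the script assumes the requests, tenacity, and playwright packages are installed, and that a Chromium build has been fetched for Playwright (typically via the standard playwright install chromium step). When run, it saves each month's Money Stock Measures spreadsheet as data/Year_Month.xlsx under the current working directory.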