Skip to content

Instantly share code, notes, and snippets.

@aryaniyaps
Last active March 19, 2025 13:48
Show Gist options
  • Save aryaniyaps/63f30f0c10950249ba30f1bc49c7db1b to your computer and use it in GitHub Desktop.
Save aryaniyaps/63f30f0c10950249ba30f1bc49c7db1b to your computer and use it in GitHub Desktop.
RBI Web Scraping
import os
import time
from urllib.parse import urljoin

import requests
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
from tenacity import retry, stop_after_attempt, wait_exponential
# Landing page for the RBI monthly bulletin (year/month accordion UI).
BASE_URL = "https://rbi.org.in/Scripts/BS_ViewBulletin.aspx"
# Desktop Chrome UA sent with every request; presumably needed so the RBI
# site serves the normal page — TODO confirm whether a default UA is blocked.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
# Row label searched for in each month's bulletin table.
TARGET_FILE = "Money Stock Measures"
# Retry mechanism for downloads
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def download_file(url, file_name):
    """Download *url* to *file_name*, retrying up to 3 times with backoff.

    Args:
        url: Absolute URL of the file to fetch.
        file_name: Local path the response body is written to (binary).

    Raises:
        Re-raises any requests error (HTTP status, timeout, connection) after
        logging it, so the tenacity decorator can retry the call.
    """
    try:
        response = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=10)
        response.raise_for_status()
        with open(file_name, "wb") as file:
            file.write(response.content)
        print(f" ✅ Downloaded: {file_name}")
    except Exception as e:
        print(f" ❌ Failed to download {file_name}: {e}")
        # Bare raise (not `raise e`) preserves the original traceback chain
        # for tenacity and for the caller's final error report.
        raise
def safe_click(page, selector, retries=3):
    """Safely click an element with retries"""
    attempt = 1
    while attempt <= retries:
        try:
            page.locator(selector).click(timeout=5000)
            return
        except PlaywrightTimeoutError as exc:
            # Log the failed attempt and pause briefly before trying again.
            print(f"Attempt {attempt}/{retries} failed to click {selector}: {exc}")
            time.sleep(2)
        attempt += 1
    # Best-effort by design: give up quietly rather than abort the whole run.
    print(f"⚠️ Skipping {selector} after {retries} attempts")
def scrape_rbi_bulletin():
    """Scrape "Money Stock Measures" spreadsheets from the RBI bulletin page.

    Walks every year and month entry in the bulletin's accordion UI, locates
    the table row labelled TARGET_FILE, and downloads the linked file to
    data/<year>_<month>.xlsx. Failures for a single year or month are logged
    and skipped (best-effort); only a total page-load failure aborts the run.
    """
    # Ensure data/ directory exists
    os.makedirs("data", exist_ok=True)
    with sync_playwright() as p:
        browser = None
        # Launch browser and load the bulletin page, with retries.
        for attempt in range(3):
            try:
                browser = p.chromium.launch(headless=True)
                context = browser.new_context(user_agent=USER_AGENT)
                page = context.new_page()
                page.goto(BASE_URL, timeout=10000)
                break
            except Exception as e:
                print(f"🔴 Failed to load page (attempt {attempt+1}/3): {e}")
                # BUGFIX: close any half-initialized browser before retrying;
                # the original leaked one Chromium process per failed attempt.
                if browser is not None:
                    browser.close()
                    browser = None
                time.sleep(3)
        else:
            print("🔴 Critical: Unable to load the RBI bulletin page after 3 attempts.")
            return
        # Wait for year buttons to load
        try:
            page.wait_for_selector(".accordionButton.year", timeout=7000)
        except PlaywrightTimeoutError:
            print("🔴 Failed to load year buttons.")
            browser.close()
            return
        year_buttons = page.locator(".accordionButton.year")
        years = year_buttons.all_inner_texts()
        for year in years:
            if year == "Archives":
                # "Archives" is a container button, not a year entry.
                continue
            try:
                print(f"📌 Selecting Year: {year}")
                # Open the Archives accordion first, then the year inside it.
                safe_click(page, ".accordionButton.year:has-text('Archives')")
                safe_click(page, f".accordionButton.year:has-text('{year}')")
                # Wait for month buttons to load
                page.wait_for_selector(
                    f".accordionContent.month[id='{year}']", timeout=5000
                )
                month_buttons = page.locator(f".accordionContent.month[id='{year}'] a")
                months = month_buttons.all_inner_texts()
            except Exception as e:
                print(f"⚠️ Could not fetch months for year {year}: {e}")
                continue
            for month in months:
                if month == "All Months":
                    continue
                print(f"📌 Selecting Month: {month}")
                # Re-expand the year accordion; it collapses between clicks.
                safe_click(page, f".accordionButton.year:has-text('{year}')")
                try:
                    safe_click(
                        page,
                        f".accordionContent.month[id='{year}'] a:has-text('{month}')",
                    )
                    page.wait_for_timeout(2000)
                except Exception as e:
                    print(f"⚠️ Could not click month {month} for year {year}: {e}")
                    continue
                try:
                    # Locate the row containing "Money Stock Measures"
                    row = page.locator(f'table.tablebg tr:has-text("{TARGET_FILE}")')
                    if row.count() == 0:
                        print(f" ⚠️ No {TARGET_FILE} found for {year} {month}")
                        continue
                    file_url = row.locator("td:nth-child(2) a").get_attribute("href")
                    # BUGFIX: get_attribute may return None (no href on the
                    # anchor); skip instead of crashing in urljoin/requests.
                    if not file_url:
                        print(f" ⚠️ No download link for {TARGET_FILE} in {year} {month}")
                        continue
                    # BUGFIX: resolve relative hrefs against the bulletin URL;
                    # urljoin is a no-op when the href is already absolute.
                    file_url = urljoin(BASE_URL, file_url)
                    # Construct file name
                    file_name = os.path.join("data", f"{year}_{month}.xlsx")
                    print(f" 💾 Downloading: {file_name} from {file_url}")
                    download_file(file_url, file_name)
                except Exception as e:
                    print(
                        f" ❌ Could not find/download {TARGET_FILE} for {year} {month}: {e}"
                    )
        # Close the browser
        browser.close()
# Entry point: run the scraper only when executed as a script,
# not when imported as a module.
if __name__ == "__main__":
    scrape_rbi_bulletin()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment