Skip to content

Instantly share code, notes, and snippets.

@aryaniyaps
Last active March 19, 2025 13:48
Show Gist options
  • Save aryaniyaps/63f30f0c10950249ba30f1bc49c7db1b to your computer and use it in GitHub Desktop.
Save aryaniyaps/63f30f0c10950249ba30f1bc49c7db1b to your computer and use it in GitHub Desktop.
RBI Web Scraping
import os
import time
from urllib.parse import urljoin

import requests
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
from tenacity import retry, stop_after_attempt, wait_exponential
# Landing page for the RBI monthly bulletin (year/month accordion UI).
BASE_URL = "https://rbi.org.in/Scripts/BS_ViewBulletin.aspx"
# Desktop Chrome UA sent with every request; presumably needed so the RBI
# site serves the normal page — TODO confirm whether a default UA is blocked.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
# Row label searched for in each month's bulletin table.
TARGET_FILE = "Money Stock Measures"
# Retry mechanism for downloads
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=2, max=10))
def download_file(url, file_name):
    """Download *url* to *file_name*, retrying up to 3 times with backoff.

    Args:
        url: Absolute URL of the file to fetch.
        file_name: Local path the response body is written to (binary).

    Raises:
        Re-raises any requests error (HTTP status, timeout, connection) after
        logging it, so the tenacity decorator can retry the call.
    """
    try:
        response = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=10)
        response.raise_for_status()
        with open(file_name, "wb") as file:
            file.write(response.content)
        print(f" ✅ Downloaded: {file_name}")
    except Exception as e:
        print(f" ❌ Failed to download {file_name}: {e}")
        # Bare raise (not `raise e`) preserves the original traceback chain
        # for tenacity and for the caller's final error report.
        raise
def safe_click(page, selector, retries=3):
    """Safely click an element with retries"""
    attempt = 1
    while attempt <= retries:
        try:
            page.locator(selector).click(timeout=5000)
            return
        except PlaywrightTimeoutError as exc:
            # Log the failed attempt and pause briefly before trying again.
            print(f"Attempt {attempt}/{retries} failed to click {selector}: {exc}")
            time.sleep(2)
        attempt += 1
    # Best-effort by design: give up quietly rather than abort the whole run.
    print(f"⚠️ Skipping {selector} after {retries} attempts")
def scrape_rbi_bulletin():
    """Scrape "Money Stock Measures" spreadsheets from the RBI bulletin page.

    Walks every year and month entry in the bulletin's accordion UI, locates
    the table row labelled TARGET_FILE, and downloads the linked file to
    data/<year>_<month>.xlsx. Failures for a single year or month are logged
    and skipped (best-effort); only a total page-load failure aborts the run.
    """
    # Ensure data/ directory exists
    os.makedirs("data", exist_ok=True)
    with sync_playwright() as p:
        browser = None
        # Launch browser and load the bulletin page, with retries.
        for attempt in range(3):
            try:
                browser = p.chromium.launch(headless=True)
                context = browser.new_context(user_agent=USER_AGENT)
                page = context.new_page()
                page.goto(BASE_URL, timeout=10000)
                break
            except Exception as e:
                print(f"🔴 Failed to load page (attempt {attempt+1}/3): {e}")
                # BUGFIX: close any half-initialized browser before retrying;
                # the original leaked one Chromium process per failed attempt.
                if browser is not None:
                    browser.close()
                    browser = None
                time.sleep(3)
        else:
            print("🔴 Critical: Unable to load the RBI bulletin page after 3 attempts.")
            return
        # Wait for year buttons to load
        try:
            page.wait_for_selector(".accordionButton.year", timeout=7000)
        except PlaywrightTimeoutError:
            print("🔴 Failed to load year buttons.")
            browser.close()
            return
        year_buttons = page.locator(".accordionButton.year")
        years = year_buttons.all_inner_texts()
        for year in years:
            if year == "Archives":
                # "Archives" is a container button, not a year entry.
                continue
            try:
                print(f"📌 Selecting Year: {year}")
                # Open the Archives accordion first, then the year inside it.
                safe_click(page, ".accordionButton.year:has-text('Archives')")
                safe_click(page, f".accordionButton.year:has-text('{year}')")
                # Wait for month buttons to load
                page.wait_for_selector(
                    f".accordionContent.month[id='{year}']", timeout=5000
                )
                month_buttons = page.locator(f".accordionContent.month[id='{year}'] a")
                months = month_buttons.all_inner_texts()
            except Exception as e:
                print(f"⚠️ Could not fetch months for year {year}: {e}")
                continue
            for month in months:
                if month == "All Months":
                    continue
                print(f"📌 Selecting Month: {month}")
                # Re-expand the year accordion; it collapses between clicks.
                safe_click(page, f".accordionButton.year:has-text('{year}')")
                try:
                    safe_click(
                        page,
                        f".accordionContent.month[id='{year}'] a:has-text('{month}')",
                    )
                    page.wait_for_timeout(2000)
                except Exception as e:
                    print(f"⚠️ Could not click month {month} for year {year}: {e}")
                    continue
                try:
                    # Locate the row containing "Money Stock Measures"
                    row = page.locator(f'table.tablebg tr:has-text("{TARGET_FILE}")')
                    if row.count() == 0:
                        print(f" ⚠️ No {TARGET_FILE} found for {year} {month}")
                        continue
                    file_url = row.locator("td:nth-child(2) a").get_attribute("href")
                    # BUGFIX: get_attribute may return None (no href on the
                    # anchor); skip instead of crashing in urljoin/requests.
                    if not file_url:
                        print(f" ⚠️ No download link for {TARGET_FILE} in {year} {month}")
                        continue
                    # BUGFIX: resolve relative hrefs against the bulletin URL;
                    # urljoin is a no-op when the href is already absolute.
                    file_url = urljoin(BASE_URL, file_url)
                    # Construct file name
                    file_name = os.path.join("data", f"{year}_{month}.xlsx")
                    print(f" 💾 Downloading: {file_name} from {file_url}")
                    download_file(file_url, file_name)
                except Exception as e:
                    print(
                        f" ❌ Could not find/download {TARGET_FILE} for {year} {month}: {e}"
                    )
        # Close the browser
        browser.close()
# Entry point: run the scraper only when executed as a script,
# not when imported as a module.
if __name__ == "__main__":
    scrape_rbi_bulletin()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment