rafaelquintanilha · December 5, 2024 13:54
diff --git a/scrape_ibx100.py b/scrape_ibx100.py
 # Video: https://www.youtube.com/watch?v=PRkVIBqC-is
 import time

 import pandas as pd
 from bs4 import BeautifulSoup
 from playwright.sync_api import sync_playwright

 from quantbrasil.models.asset import get_ticker_map
 from quantbrasil.models.portfolio import add_assets, create, remove_all


 def parse_table_beautifulsoup(table_html):
    """
    Build a list of row dictionaries from a table's inner HTML.

    The ``<th>`` cells found under ``<thead>`` supply the dictionary keys and
    every ``<tr>`` under ``<tbody>`` yields one dictionary. Cell text is
    stripped of surrounding whitespace. The cell at position 3 is turned into
    an ``int`` after its "." characters are removed, the cell at position 4
    into a ``float`` after "," is swapped for "."; every other cell is kept
    as a string.
    """
    document = BeautifulSoup(table_html, "html.parser")

    # Column titles, in document order
    column_names = [cell.text.strip() for cell in document.thead.find_all("th")]

    records = []
    for table_row in document.tbody.find_all("tr"):
        record = {}
        for position, cell in enumerate(table_row.find_all("td")):
            text = cell.text.strip()

            if position == 3:
                # Qtde. Teórica column: drop the dots, then read as integer
                parsed = int(text.replace(".", ""))
            elif position == 4:
                # Part. (%) column: comma becomes the decimal point
                parsed = float(text.replace(",", "."))
            else:
                parsed = text

            record[column_names[position]] = parsed
        records.append(record)

    return records


 def parse_table_pandas(table_html):
    """
    Parse the inner HTML of a table into a pandas DataFrame.

    Args:
        table_html: Markup found inside a ``<table>`` element; it is wrapped
            in ``<table>`` tags before being handed to ``pandas.read_html``.

    Returns:
        DataFrame of the first table found, with stripped column names,
        "Qtde. Teórica" converted with ``int`` and "Part. (%)" with ``float``.

    Raises:
        KeyError: if "Qtde. Teórica" or "Part. (%)" is not a column.
        ValueError: if no table is found or a cell in those two columns
            cannot be converted to a number.
    """
    # Local import: keeps this function self-contained.
    from io import StringIO

    # read_html defaults to thousands="," and decimal=".". With those defaults
    # "11,250" is read as 11250 and "235.660" as 235.66, so the values were
    # already corrupted before any string clean-up could run. Declare the
    # separators the table actually uses instead.
    # The markup is wrapped in StringIO because passing literal HTML as a
    # plain string is deprecated in recent pandas releases.
    df = pd.read_html(
        StringIO(f"<table>{table_html}</table>"),
        thousands=".",
        decimal=",",
    )[0]

    # Clean column names
    df.columns = [col.strip() for col in df.columns]

    # Separators were normalised by read_html; only the final type is enforced
    df["Qtde. Teórica"] = df["Qtde. Teórica"].apply(int)
    df["Part. (%)"] = df["Part. (%)"].apply(float)

    return df


 # Example usage
 def process_table(table_html, method="pandas"):
    """
    Parse table markup with the selected backend and report the outcome.

    ``method="pandas"`` delegates to ``parse_table_pandas``; any other value
    delegates to ``parse_table_beautifulsoup``. On success the row count is
    printed and the parser's result is returned. Any exception raised while
    parsing is printed and ``None`` is returned instead.
    """
    try:
        if method != "pandas":
            parsed = parse_table_beautifulsoup(table_html)
            print(f"Successfully parsed {len(parsed)} rows using BeautifulSoup")
        else:
            parsed = parse_table_pandas(table_html)
            print(f"Successfully parsed {len(parsed)} rows using pandas")
    except Exception as exc:
        print(f"Error parsing table: {str(exc)}")
        return None

    return parsed


 b3_url = "https://sistemaswebb3-listados.b3.com.br/indexPage/day/IBXX?language=pt-br"
 table_selector = "table.table-responsive-sm.table-responsive-md"

 # Step 1: fetch the table markup. Only browser work happens in this block and
 # the browser is closed even when navigation or a wait fails.
 with sync_playwright() as p:
    browser = p.chromium.launch()
    try:
        page = browser.new_page()
        page.goto(b3_url)

        # Wait for the table before using the page-size selector
        page.wait_for_selector(table_selector)
        page.locator("#selectPage").select_option("120")

        # Fixed pause after changing the page size, before reading the table
        time.sleep(2)

        table = page.wait_for_selector(table_selector)
        table_html = table.inner_html()
    finally:
        browser.close()

 # Step 2: parse. process_table prints the error and returns None on failure,
 # so stop here instead of failing later with a TypeError on iteration.
 data = process_table(table_html, method="beautifulsoup")
 if data is None:
    raise SystemExit("Could not parse the index table; portfolio left untouched")

 tickers = [row.get("Código") for row in data]
 weights = [row.get("Part. (%)") for row in data]

 print(tickers)
 print(f"Found {len(tickers)} tickers")

 # An empty scrape would otherwise clear the portfolio and add nothing back.
 if not tickers:
    raise SystemExit("No tickers were scraped; portfolio left untouched")

 # Step 3: resolve asset ids BEFORE modifying the portfolio. A ticker missing
 # from the map raises KeyError here, while the stored portfolio is intact.
 ticker_map = get_ticker_map(tickers)
 asset_ids = [ticker_map[ticker] for ticker in tickers]

 # Step 4: replace the portfolio contents
 portfolio_id = create("IBX100")
 print(f"Created portfolio with id {portfolio_id}")

 remove_all(portfolio_id)
 print(f"Removed all assets from portfolio {portfolio_id}")

 add_assets(asset_ids, portfolio_id, weights)

 print(f"Added {len(asset_ids)} assets to portfolio")
	# Video: https://www.youtube.com/watch?v=PRkVIBqC-is
	import time

	import pandas as pd
	from bs4 import BeautifulSoup
	from playwright.sync_api import sync_playwright

	from quantbrasil.models.asset import get_ticker_map
	from quantbrasil.models.portfolio import add_assets, create, remove_all


	def parse_table_beautifulsoup(table_html):
	    """
	    Parse HTML table using BeautifulSoup.

	    Header names are read from the ``<th>`` cells under ``<thead>`` and
	    used as keys; each ``<tr>`` under ``<tbody>`` becomes one dictionary.
	    The cell at index 3 is converted to ``int`` (dots removed) and the
	    cell at index 4 to ``float`` (comma replaced by a dot); other cells
	    are kept as stripped strings.

	    Returns a list of dictionaries with the parsed data.
	    """
	    soup = BeautifulSoup(table_html, "html.parser")

	    # Extract headers
	    headers = []
	    for th in soup.thead.find_all("th"):
	        headers.append(th.text.strip())

	    # Extract rows
	    rows = []
	    for tr in soup.tbody.find_all("tr"):
	        row = {}
	        for idx, td in enumerate(tr.find_all("td")):
	            # Clean and format the data
	            value = td.text.strip()

	            # Convert numeric values
	            if idx == 3:  # Qtde. Teórica column
	                value = int(value.replace(".", ""))
	            elif idx == 4:  # Part. (%) column
	                value = float(value.replace(",", "."))

	            row[headers[idx]] = value
	        rows.append(row)

	    return rows


	def parse_table_pandas(table_html):
	    """
	    Parse the inner HTML of a table into a pandas DataFrame.

	    Args:
	        table_html: Markup found inside a ``<table>`` element; it is
	            wrapped in ``<table>`` tags before being handed to
	            ``pandas.read_html``.

	    Returns:
	        DataFrame of the first table found, with stripped column names,
	        "Qtde. Teórica" converted with ``int`` and "Part. (%)" with
	        ``float``.

	    Raises:
	        KeyError: if "Qtde. Teórica" or "Part. (%)" is not a column.
	        ValueError: if no table is found or a cell in those two columns
	            cannot be converted to a number.
	    """
	    # Local import: keeps this function self-contained.
	    from io import StringIO

	    # read_html defaults to thousands="," and decimal=".". With those
	    # defaults "11,250" is read as 11250 and "235.660" as 235.66, so the
	    # values were corrupted before any string clean-up could run. Declare
	    # the separators the table actually uses instead.
	    # The markup is wrapped in StringIO because passing literal HTML as a
	    # plain string is deprecated in recent pandas releases.
	    df = pd.read_html(
	        StringIO(f"<table>{table_html}</table>"),
	        thousands=".",
	        decimal=",",
	    )[0]

	    # Clean column names
	    df.columns = [col.strip() for col in df.columns]

	    # Separators were normalised by read_html; only the type is enforced
	    df["Qtde. Teórica"] = df["Qtde. Teórica"].apply(int)
	    df["Part. (%)"] = df["Part. (%)"].apply(float)

	    return df


	# Example usage
	def process_table(table_html, method="pandas"):
	    """
	    Parse table markup with the chosen backend and print the outcome.

	    ``method="pandas"`` uses ``parse_table_pandas``; any other value uses
	    ``parse_table_beautifulsoup``. Returns the parser's result, or
	    ``None`` after printing the error when parsing raises.
	    """
	    try:
	        if method == "pandas":
	            data = parse_table_pandas(table_html)
	            print(f"Successfully parsed {len(data)} rows using pandas")
	            return data
	        else:
	            data = parse_table_beautifulsoup(table_html)
	            print(f"Successfully parsed {len(data)} rows using BeautifulSoup")
	            return data
	    except Exception as e:
	        print(f"Error parsing table: {str(e)}")
	        return None


	b3_url = "https://sistemaswebb3-listados.b3.com.br/indexPage/day/IBXX?language=pt-br"
	table_selector = "table.table-responsive-sm.table-responsive-md"

	# Step 1: fetch the table markup. Only browser work happens in this block
	# and the browser is closed even when navigation or a wait fails.
	with sync_playwright() as p:
	    browser = p.chromium.launch()
	    try:
	        page = browser.new_page()
	        page.goto(b3_url)

	        # Wait for the table before using the page-size selector
	        page.wait_for_selector(table_selector)
	        page.locator("#selectPage").select_option("120")

	        # Fixed pause after changing the page size, before reading the table
	        time.sleep(2)

	        table = page.wait_for_selector(table_selector)
	        table_html = table.inner_html()
	    finally:
	        browser.close()

	# Step 2: parse. process_table prints the error and returns None on
	# failure, so stop here instead of failing later with a TypeError.
	data = process_table(table_html, method="beautifulsoup")
	if data is None:
	    raise SystemExit("Could not parse the index table; portfolio left untouched")

	tickers = [row.get("Código") for row in data]
	weights = [row.get("Part. (%)") for row in data]

	print(tickers)
	print(f"Found {len(tickers)} tickers")

	# An empty scrape would otherwise clear the portfolio and add nothing back.
	if not tickers:
	    raise SystemExit("No tickers were scraped; portfolio left untouched")

	# Step 3: resolve asset ids BEFORE modifying the portfolio. A ticker
	# missing from the map raises KeyError here, while the stored portfolio
	# is intact.
	ticker_map = get_ticker_map(tickers)
	asset_ids = [ticker_map[ticker] for ticker in tickers]

	# Step 4: replace the portfolio contents
	portfolio_id = create("IBX100")
	print(f"Created portfolio with id {portfolio_id}")

	remove_all(portfolio_id)
	print(f"Removed all assets from portfolio {portfolio_id}")

	add_assets(asset_ids, portfolio_id, weights)

	print(f"Added {len(asset_ids)} assets to portfolio")