Skip to content

Instantly share code, notes, and snippets.

@svpino
Created May 22, 2023 11:40
Show Gist options
  • Save svpino/dfa7b75f6bf68479b56d495a3575cd94 to your computer and use it in GitHub Desktop.
Scraping browser example
import re
import asyncio
import os
from bs4 import BeautifulSoup
from bs4.element import NavigableString
from playwright.async_api import async_playwright
# Connection settings for the remote scraping browser that get() connects to.
# Each value can be overridden through an environment variable so that real
# credentials never have to be committed to source; the fallbacks are the
# original placeholder values.
USERNAME = os.environ.get("SCRAPING_BROWSER_USERNAME", "USERNAME HERE")
PASSWORD = os.environ.get("SCRAPING_BROWSER_PASSWORD", "PASSWORD HERE")
HOST = os.environ.get("SCRAPING_BROWSER_HOST", "zproxy.lum-superproxy.io:9222")
async def get(uri, parse_fn):
    """Load *uri* in the remote browser and return ``parse_fn(html)``.

    Connects to the browser endpoint built from USERNAME, PASSWORD and HOST
    over CDP, navigates to *uri*, reads the rendered document's outer HTML
    and hands it to *parse_fn*.

    Args:
        uri: Address of the page to load.
        parse_fn: Callable taking the page HTML as a string. It may be a
            plain function or a coroutine function; an awaitable result is
            awaited before being returned.

    Returns:
        Whatever *parse_fn* produced, or ``None`` when reading or parsing
        the page raised (the exception is printed, not re-raised).
    """
    import inspect

    print("Loading", uri)
    browser_url = f"https://{USERNAME}:{PASSWORD}@{HOST}"
    async with async_playwright() as pw:
        browser = await pw.chromium.connect_over_cdp(browser_url)
        try:
            page = await browser.new_page()
            await page.goto(uri, timeout=120000)
            try:
                html = await page.evaluate(
                    "()=>document.documentElement.outerHTML"
                )
                result = parse_fn(html)
                # Synchronous parsers return their value directly; awaiting
                # a plain value would raise TypeError and lose the result.
                if inspect.isawaitable(result):
                    result = await result
                return result
            except Exception as e:
                # Best-effort scrape: report the failure and yield None.
                print(e)
                return None
        finally:
            # Release the remote browser on every path, including failures.
            await browser.close()
def parse_stock_price(html):
    """Extract the quoted price from a stock page's HTML.

    Looks for the first ``<fin-streamer>`` element whose ``data-test``
    attribute is ``"qsp-price"`` and returns its ``value`` attribute.

    Args:
        html: Page markup as a string.

    Returns:
        The ``value`` attribute of the matching element, or ``"N/A"`` when
        the element is absent or carries no ``value`` attribute.
    """
    soup = BeautifulSoup(html, "html.parser")
    element = soup.find("fin-streamer", attrs={"data-test": "qsp-price"})
    if element is None:
        return "N/A"
    # Fall back to the same sentinel when the attribute itself is missing,
    # so callers never receive None.
    return element.get("value", "N/A")
async def parse_top_stocks(html):
    """Print the ticker and current price for each stock listed in *html*.

    Finds the table with class ``mdc-table__content`` and, inside it, every
    ``<span class="article__table-subunit">`` whose first child is a plain
    text node. The span text is used as the ticker symbol; its price is
    fetched from the Yahoo Finance quote page through ``get`` with
    ``parse_stock_price`` and the pair is printed.

    Args:
        html: Markup of the article page as a string.

    Returns:
        None. Results are written to standard output.
    """
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", attrs={"class": "mdc-table__content"})
    if table is None:
        # Page layout changed or the page failed to render; nothing to do.
        print("No stocks table found")
        return

    spans = table.find_all("span", attrs={"class": "article__table-subunit"})
    for span in spans:
        # Only spans that start with a text node hold a ticker; empty spans
        # and spans that start with a nested tag are skipped.
        if not span.contents:
            continue
        if not isinstance(span.contents[0], NavigableString):
            continue

        # Strip surrounding whitespace so it does not end up in the URL.
        ticker = span.get_text().strip()
        if not ticker:
            continue

        price = await get(
            uri=f"https://finance.yahoo.com/quote/{ticker}",
            parse_fn=parse_stock_price,
        )
        print(ticker, price)
if __name__ == "__main__":
    # Script entry point: load the article and print each listed stock's
    # price. Guarded so importing this module does not start a scrape.
    asyncio.run(get(
        uri="https://www.morningstar.com/articles/1091882/the-10-best-companies-to-invest-in-now",
        parse_fn=parse_top_stocks,
    ))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment