FastAPI App for Web Scraping on Modal.com
""" | |
This module defines the FastAPI application and its endpoints. | |
It includes the endpoint for scraping a website and potentially an endpoint for finding contacts. | |
The application is wrapped with a stub function for deployment. | |
""" | |
from typing import Any | |
from common import ENV, image, secret, stub | |
from fastapi import FastAPI | |
from modal import asgi_app | |
from models import ScrapeWebsiteRequest, WebsiteContentOutputSchema | |
from scraper import get_website_content | |
web_app = FastAPI() | |
@web_app.post("/scrape-website", response_model=WebsiteContentOutputSchema) | |
async def scrape_website(request: ScrapeWebsiteRequest) -> Any: | |
""" | |
This function scrapes the website content based on the provided URL. | |
Args: | |
request (ScrapeWebsiteRequest): The request object containing | |
the URL of the website to be scraped. | |
Returns: | |
WebsiteContentOutputSchema: The response object containing the scraped website content. | |
""" | |
content = await get_website_content(request.url) | |
if request.keyword: | |
content = content.copy(update={"keyword": request.keyword}) | |
return content | |
@stub.function(image=image, secret=secret) | |
@asgi_app(label=f"backlinkgpt-fast-api-{ENV}") | |
def fastapi_app(): | |
""" | |
This function returns the FastAPI application instance. | |
Returns: | |
FastAPI: The FastAPI application instance. | |
""" | |
return web_app |
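Both files import their request and response schemas from a models module that is not included in the gist. The field names used in the code (url, keyword, bodyText, pageTitle, and so on) and the Pydantic v1 style copy(update=...) call suggest roughly the shape below; every type and default here is an inferred assumption, not the author's actual file.

"""
Hypothetical reconstruction of models.py. Field names are inferred
from their usage in the gist; types and optionality are assumptions.
"""
from typing import Optional

from pydantic import BaseModel


class ScrapeWebsiteRequest(BaseModel):
    """Request body for the /scrape-website endpoint."""

    url: str
    keyword: Optional[str] = None  # merged into the response when provided


class GetWebsiteContentSchema(BaseModel):
    """Argument schema for the get_website_content langchain tool."""

    url: str


class WebsiteContentOutputSchema(BaseModel):
    """Scraped content returned by the endpoint."""

    bodyText: str
    pageTitle: Optional[str] = None
    metaTitle: Optional[str] = None
    metaDescription: Optional[str] = None
    metaImageUrl: Optional[str] = None
    faviconImageUrl: Optional[str] = None
    url: str
    keyword: Optional[str] = None

The second file in the gist, the scraper module, follows.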
""" | |
This module contains functions for web scraping, | |
including converting HTML content to BeautifulSoup objects and markdown, | |
and getting website content using playwright. | |
""" | |
from bs4 import BeautifulSoup | |
from html2text import HTML2Text | |
from langchain.tools import tool | |
from models import GetWebsiteContentSchema, WebsiteContentOutputSchema | |
from playwright.async_api import async_playwright | |
def convert_content_to_soup(content: str) -> BeautifulSoup: | |
"""Convert html content to soup | |
Args: | |
content (str): html content | |
Returns: | |
BeautifulSoup: soup | |
""" | |
soup = BeautifulSoup(content, "html.parser") | |
return soup | |
def convert_content_to_markdown(content: str) -> str: | |
"""Convert soup to markdown | |
Args: | |
soup (BeautifulSoup): soup | |
Returns: | |
str: markdown | |
""" | |
text_maker = HTML2Text() | |
markdown = text_maker.handle(content) | |
return markdown | |
@tool(return_direct=False, args_schema=GetWebsiteContentSchema) | |
async def get_website_content(url: str) -> WebsiteContentOutputSchema: | |
"""Use this to get the text content of a website.""" | |
async with async_playwright() as p: # pylint: disable=invalid-name | |
# can be used for local debugging in jupyter notebook | |
# p = await async_playwright().start() | |
# browser = await p.chromium.launch(headless=False) | |
browser = await p.chromium.launch() | |
page = await browser.new_page() | |
print(f"Goto {url}") | |
await page.goto(url) | |
# get page content | |
content = await page.content() | |
await browser.close() | |
# parse with BeautifulSoup | |
soup = convert_content_to_soup(content) | |
# body_text | |
body_text = convert_content_to_markdown(content=content) | |
# page_title | |
page_title = soup.find("title").text | |
# meta_title | |
meta_title = soup.find("meta", property="og:title") | |
meta_title = meta_title["content"] if meta_title else None | |
# meta_description | |
meta_description = soup.find("meta", property="og:description") | |
meta_description = meta_description["content"] if meta_description else None | |
# meta_image_url | |
meta_image_url = soup.find("meta", property="og:image") | |
meta_image_url = meta_image_url["content"] if meta_image_url else None | |
# favicon_image_url | |
favicon_image_url = soup.find("link", rel="icon") | |
favicon_image_url = url + favicon_image_url["href"] if favicon_image_url else None | |
print(f"Crawled {url}") | |
return WebsiteContentOutputSchema( | |
bodyText=body_text, | |
pageTitle=page_title, | |
metaTitle=meta_title, | |
metaDescription=meta_description, | |
metaImageUrl=meta_image_url, | |
faviconImageUrl=favicon_image_url, | |
url=url, | |
) |
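The app also imports ENV, image, secret, and stub from a common module that the gist likewise omits. A minimal sketch under the Modal API of mid-2023 might look like this; the stub name, secret name, and package list are placeholders, not the author's configuration.

"""
Hypothetical sketch of common.py. All names and package choices here
are assumptions; only the four exported symbols are known from the gist.
"""
import os

from modal import Image, Secret, Stub

ENV = os.environ.get("ENV", "dev")

# Image with the scraper's Python dependencies plus the Chromium build
# that playwright needs at runtime.
image = (
    Image.debian_slim()
    .pip_install("fastapi", "beautifulsoup4", "html2text", "langchain", "playwright")
    .run_commands("playwright install --with-deps chromium")
)

secret = Secret.from_name("backlinkgpt-secret")  # placeholder secret name

stub = Stub(f"backlinkgpt-{ENV}")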
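After modal deploy, Modal serves the labeled ASGI app at a generated *.modal.run URL. A client call might then look like the following sketch; the workspace subdomain in the URL is a placeholder.

# Hypothetical client call; substitute your Modal workspace's URL.
import httpx

response = httpx.post(
    "https://your-workspace--backlinkgpt-fast-api-dev.modal.run/scrape-website",
    json={"url": "https://example.com", "keyword": "example"},
    timeout=60,
)
print(response.json()["pageTitle"])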