FastAPI App for Web Scraping on Modal.com
""" | |
This module defines the FastAPI application and its endpoints. | |
It includes the endpoint for scraping a website and potentially an endpoint for finding contacts. | |
The application is wrapped with a stub function for deployment. | |
""" | |
from typing import Any | |
from common import ENV, image, secret, stub | |
from fastapi import FastAPI | |
from modal import asgi_app | |
from models import ScrapeWebsiteRequest, WebsiteContentOutputSchema | |
from scraper import get_website_content | |
web_app = FastAPI() | |
@web_app.post("/scrape-website", response_model=WebsiteContentOutputSchema) | |
async def scrape_website(request: ScrapeWebsiteRequest) -> Any: | |
""" | |
This function scrapes the website content based on the provided URL. | |
Args: | |
request (ScrapeWebsiteRequest): The request object containing | |
the URL of the website to be scraped. | |
Returns: | |
WebsiteContentOutputSchema: The response object containing the scraped website content. | |
""" | |
content = await get_website_content(request.url) | |
if request.keyword: | |
content = content.copy(update={"keyword": request.keyword}) | |
return content | |
@stub.function(image=image, secret=secret) | |
@asgi_app(label=f"backlinkgpt-fast-api-{ENV}") | |
def fastapi_app(): | |
""" | |
This function returns the FastAPI application instance. | |
Returns: | |
FastAPI: The FastAPI application instance. | |
""" | |
return web_app |
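Both files import their request and response schemas from a models module that is not included in the gist. The field names used in the code (url, keyword, bodyText, pageTitle, and so on) and the Pydantic v1 style copy(update=...) call suggest roughly the shape below; every type and default here is an inferred assumption, not the author's actual file.

"""
Hypothetical reconstruction of models.py. Field names are inferred
from their usage in the gist; types and optionality are assumptions.
"""
from typing import Optional

from pydantic import BaseModel


class ScrapeWebsiteRequest(BaseModel):
    """Request body for the /scrape-website endpoint."""

    url: str
    keyword: Optional[str] = None  # merged into the response when provided


class GetWebsiteContentSchema(BaseModel):
    """Argument schema for the get_website_content langchain tool."""

    url: str


class WebsiteContentOutputSchema(BaseModel):
    """Scraped content returned by the endpoint."""

    bodyText: str
    pageTitle: Optional[str] = None
    metaTitle: Optional[str] = None
    metaDescription: Optional[str] = None
    metaImageUrl: Optional[str] = None
    faviconImageUrl: Optional[str] = None
    url: str
    keyword: Optional[str] = None

The second file in the gist, the scraper module, follows.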
""" | |
This module contains functions for web scraping, | |
including converting HTML content to BeautifulSoup objects and markdown, | |
and getting website content using playwright. | |
""" | |
from bs4 import BeautifulSoup | |
from html2text import HTML2Text | |
from langchain.tools import tool | |
from models import GetWebsiteContentSchema, WebsiteContentOutputSchema | |
from playwright.async_api import async_playwright | |
def convert_content_to_soup(content: str) -> BeautifulSoup: | |
"""Convert html content to soup | |
Args: | |
content (str): html content | |
Returns: | |
BeautifulSoup: soup | |
""" | |
soup = BeautifulSoup(content, "html.parser") | |
return soup | |
def convert_content_to_markdown(content: str) -> str: | |
"""Convert soup to markdown | |
Args: | |
soup (BeautifulSoup): soup | |
Returns: | |
str: markdown | |
""" | |
text_maker = HTML2Text() | |
markdown = text_maker.handle(content) | |
return markdown | |
@tool(return_direct=False, args_schema=GetWebsiteContentSchema) | |
async def get_website_content(url: str) -> WebsiteContentOutputSchema: | |
"""Use this to get the text content of a website.""" | |
async with async_playwright() as p: # pylint: disable=invalid-name | |
# can be used for local debugging in jupyter notebook | |
# p = await async_playwright().start() | |
# browser = await p.chromium.launch(headless=False) | |
browser = await p.chromium.launch() | |
page = await browser.new_page() | |
print(f"Goto {url}") | |
await page.goto(url) | |
# get page content | |
content = await page.content() | |
await browser.close() | |
# parse with BeautifulSoup | |
soup = convert_content_to_soup(content) | |
# body_text | |
body_text = convert_content_to_markdown(content=content) | |
# page_title | |
page_title = soup.find("title").text | |
# meta_title | |
meta_title = soup.find("meta", property="og:title") | |
meta_title = meta_title["content"] if meta_title else None | |
# meta_description | |
meta_description = soup.find("meta", property="og:description") | |
meta_description = meta_description["content"] if meta_description else None | |
# meta_image_url | |
meta_image_url = soup.find("meta", property="og:image") | |
meta_image_url = meta_image_url["content"] if meta_image_url else None | |
# favicon_image_url | |
favicon_image_url = soup.find("link", rel="icon") | |
favicon_image_url = url + favicon_image_url["href"] if favicon_image_url else None | |
print(f"Crawled {url}") | |
return WebsiteContentOutputSchema( | |
bodyText=body_text, | |
pageTitle=page_title, | |
metaTitle=meta_title, | |
metaDescription=meta_description, | |
metaImageUrl=meta_image_url, | |
faviconImageUrl=favicon_image_url, | |
url=url, | |
) |
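The app also imports ENV, image, secret, and stub from a common module that the gist likewise omits. A minimal sketch under the Modal API of mid-2023 might look like this; the stub name, secret name, and package list are placeholders, not the author's configuration.

"""
Hypothetical sketch of common.py. All names and package choices here
are assumptions; only the four exported symbols are known from the gist.
"""
import os

from modal import Image, Secret, Stub

ENV = os.environ.get("ENV", "dev")

# Image with the scraper's Python dependencies plus the Chromium build
# that playwright needs at runtime.
image = (
    Image.debian_slim()
    .pip_install("fastapi", "beautifulsoup4", "html2text", "langchain", "playwright")
    .run_commands("playwright install --with-deps chromium")
)

secret = Secret.from_name("backlinkgpt-secret")  # placeholder secret name

stub = Stub(f"backlinkgpt-{ENV}")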
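After modal deploy, Modal serves the labeled ASGI app at a generated *.modal.run URL. A client call might then look like the following sketch; the workspace subdomain in the URL is a placeholder.

# Hypothetical client call; substitute your Modal workspace's URL.
import httpx

response = httpx.post(
    "https://your-workspace--backlinkgpt-fast-api-dev.modal.run/scrape-website",
    json={"url": "https://example.com", "keyword": "example"},
    timeout=60,
)
print(response.json()["pageTitle"])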