Skip to content

Instantly share code, notes, and snippets.

@KrishnanSriram
Created August 29, 2025 22:45
Show Gist options
  • Save KrishnanSriram/9e103d8f77ed1e47a5ec3ee3b660ba98 to your computer and use it in GitHub Desktop.
Save KrishnanSriram/9e103d8f77ed1e47a5ec3ee3b660ba98 to your computer and use it in GitHub Desktop.
Create a simple CrewAI solution to scrape web pages
###############################################################################
# Scraper tool
###############################################################################
import requests
from bs4 import BeautifulSoup
from crewai.tools import BaseTool
from pydantic import BaseModel, Field
from typing import Type
class WebsiteContentScraperArgs(BaseModel):
    """Argument schema for the Website Content Scraper tool."""
    # The target page; callers must supply a full URL including the scheme.
    website_url: str = Field(..., description="The full URL of the website to scrape.")
class WebsiteContentScraper(BaseTool):
    """
    A tool to scrape the visible text content from a given website URL.

    Uses ``requests`` to fetch the page and BeautifulSoup to strip markup,
    returning plain text suitable for analysis or summarization.
    """
    name: str = "Website Content Scraper"
    description: str = "Extracts all visible text content from a specified website URL. Useful for getting the raw text from a webpage for analysis or summarization."
    args_schema: Type[BaseModel] = WebsiteContentScraperArgs

    def _run(self, website_url: str) -> str:
        """
        Fetch and parse the website content.

        Args:
            website_url: The full URL of the page to scrape.

        Returns:
            A status string containing the extracted text, or an error
            message string if the request or parsing failed (the tool
            never raises, so the calling agent can always recover).
        """
        try:
            # BUG FIX: the original passed verify=False, which disables TLS
            # certificate validation and exposes every request to
            # man-in-the-middle attacks. Certificate verification is left at
            # its secure default here; the bounded timeout is kept.
            response = requests.get(website_url, timeout=10)
            response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
            # Parse the HTML content of the page
            soup = BeautifulSoup(response.content, 'html.parser')
            # Remove non-visible elements before extracting text
            for script_or_style in soup(['script', 'style']):
                script_or_style.decompose()
            # Get the text content
            text_content = soup.get_text(separator='\n', strip=True)
            # Return a clean, concise string of the content
            return f"Successfully scraped content from {website_url}:\n\n{text_content}"
        except requests.exceptions.RequestException as e:
            return f"Error: Could not retrieve the website content. An error occurred: {e}"
        except Exception as e:
            return f"An unexpected error occurred: {e}"
###############################################################################
# Summarizer Tool
###############################################################################
import requests
from crewai.tools import BaseTool
from crewai import LLM
from pydantic import BaseModel, Field
from typing import Type
# Local Ollama-served llama3.2 model used for summarization.
# Low temperature (0.3) keeps the output relatively deterministic.
llm = LLM(model="ollama/llama3.2", base_url="http://localhost:11434", temperature=0.3)
class TextSummarizerArgs(BaseModel):
    """Argument schema for the Text Summarizer tool."""
    # Raw text passed in by the calling agent.
    text_content: str = Field(..., description="The text content to be summarized.")
class TextSummarizerTool(BaseTool):
    """
    A tool to summarize a long piece of text using Ollama and llama3.2.
    """
    name: str = "Text Summarizer"
    description: str = "Summarizes a given long text into a shorter, concise version (max 20 lines) using Ollama and the llama3.2 model."
    args_schema: Type[BaseModel] = TextSummarizerArgs

    def _run(self, text_content: str) -> str:
        """
        Send the text to the local Ollama instance for summarization.

        Args:
            text_content: The text to condense.

        Returns:
            The summary prefixed with "Summary:", or an error message
            string if the model call failed.
        """
        try:
            prompt = f"Please summarize the following text in no more than 20 lines:\n\n{text_content}"
            # BUG FIX: the original used self.llm.invoke(prompt), but this
            # pydantic model declares no `llm` field, so `self.llm` raised an
            # AttributeError at runtime. Use the module-level `llm` instead;
            # crewai's LLM exposes call() for direct completion requests.
            summary = llm.call(prompt)
            return f"Summary:\n\n{summary}"
        except Exception as e:
            return f"An unexpected error occurred during summarization: {e}"
###############################################################################
# Multiagent
###############################################################################
# BUG FIX: `from openai import max_retries` is commented out below —
# `max_retries` is not a public name exported by the openai package (almost
# certainly an accidental IDE auto-import) and it is never used in this file.
# from openai import max_retries
from scrapper_tool import WebsiteContentScraper
from summarizer_tool import TextSummarizerTool
from crewai import Agent, Task, Crew, LLM

# Instantiate the tools once; they are shared by the agents below.
scrape_tool = WebsiteContentScraper()
summarizer_tool = TextSummarizerTool()
# Local Ollama-served llama3.2 model driving both agents.
llm = LLM(model="ollama/llama3.2", base_url="http://localhost:11434", temperature=0.3)
def main():
    """
    Scrape a hard-coded article URL and summarize it with a two-agent crew.

    Agent 1 scrapes the page text; Agent 2 condenses it to at most 20 lines.
    The final crew result is printed to stdout.
    """
    url_to_process = "https://www.zdnet.com/article/i-deciphered-apples-iphone-17-event-invite-my-3-biggest-theories-for-whats-expected/"
    print(f"Processing URL: {url_to_process}\n")

    # Agent 1: Scraper
    # This agent is responsible only for scraping the website content.
    scraper_agent = Agent(
        role='Website Content Scraper',
        goal=f'Scrape the full text content from the website: {url_to_process}',
        backstory=(
            "You are a diligent web scraper. Your only job is to take a URL "
            "and use your tool to extract all the text from it. You do not analyze or summarize."
        ),
        verbose=True,
        allow_delegation=False,
        tools=[scrape_tool],
        llm=llm,
        max_iter=3,
        max_retries=3,
    )

    # Agent 2: Summarizer
    # This agent is responsible only for summarizing text.
    summarizer_agent = Agent(
        role='Expert Text Summarizer',
        goal='Summarize the text content provided to you into a concise, 20-line summary.',
        backstory=(
            "You are a master of summarization. You take long pieces of text and distill them "
            "into their most important parts, creating a short and easy-to-read summary."
        ),
        verbose=True,
        allow_delegation=False,
        tools=[summarizer_tool],
        llm=llm
    )

    # --- Create Tasks for the Agents ---
    scrape_task = Task(
        description=f'Use the Website Content Scraper tool to scrape the full text content from the URL: {url_to_process}.',
        expected_output='The complete text content of the webpage, ready for the summarizer.',
        agent=scraper_agent
    )
    summarize_task = Task(
        description='Take the scraped text content from the previous task and use the Text Summarizer tool to create a concise summary of no more than 20 lines.',
        expected_output='A summary of the article in 20 lines or less.',
        agent=summarizer_agent,
        context=[scrape_task]  # Runs after scrape_task and consumes its output
    )

    # --- Create and Run the Crew ---
    # BUG FIX: the original crew registered only the scraper agent and task
    # (the summarizer entries were commented out), so a run advertised as
    # "Scrape and Summarize" never produced a summary. Both stages are wired
    # in here, matching the task context set up above.
    website_research_crew = Crew(
        agents=[scraper_agent, summarizer_agent],
        tasks=[scrape_task, summarize_task],
        verbose=True
    )

    print("\n--- Running Crew to Scrape and Summarize ---")
    result = website_research_crew.kickoff()

    print("\n\n--------------------")
    print("--- CrewAI Process Complete ---")
    print("Final Result:")
    print(result)
    print("--------------------")
# Script entry point: runs the scrape-and-summarize crew when executed
# directly (for testing purposes).
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment