Create a simple CrewAI solution to scrape web pages: a scraper tool (scrapper_tool.py), a summarizer tool (summarizer_tool.py), and a two-agent crew that chains them together using a local Ollama llama3.2 model.
###############################################################################
# Scraper tool (scrapper_tool.py)
###############################################################################
import requests
from bs4 import BeautifulSoup
from crewai.tools import BaseTool
from pydantic import BaseModel, Field
from typing import Type


class WebsiteContentScraperArgs(BaseModel):
    """Input model for the WebsiteContentScraper tool."""
    website_url: str = Field(..., description="The full URL of the website to scrape.")


class WebsiteContentScraper(BaseTool):
    """
    A tool to scrape the text content from a given website URL.
    It uses requests to fetch the webpage and BeautifulSoup to extract the text.
    """
    name: str = "Website Content Scraper"
    description: str = "Extracts all visible text content from a specified website URL. Useful for getting the raw text from a webpage for analysis or summarization."
    args_schema: Type[BaseModel] = WebsiteContentScraperArgs

    def _run(self, website_url: str) -> str:
        """
        The main execution method for the tool.
        It fetches and parses the website content.
        """
        try:
            # Send a GET request to the website.
            # NOTE: verify=False disables TLS certificate checks; acceptable
            # for a quick local demo, but enable verification in production.
            response = requests.get(website_url, timeout=10, verify=False)
            response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

            # Parse the HTML content of the page
            soup = BeautifulSoup(response.content, 'html.parser')

            # Remove script and style elements
            for script_or_style in soup(['script', 'style']):
                script_or_style.decompose()

            # Get the text content
            text_content = soup.get_text(separator='\n', strip=True)

            # Return a clean, concise string of the content
            return f"Successfully scraped content from {website_url}:\n\n{text_content}"
        except requests.exceptions.RequestException as e:
            return f"Error: Could not retrieve the website content. An error occurred: {e}"
        except Exception as e:
            return f"An unexpected error occurred: {e}"


###############################################################################
# Summarizer tool (summarizer_tool.py)
###############################################################################
from crewai.tools import BaseTool
from crewai import LLM
from pydantic import BaseModel, Field
from typing import Type

# A local Ollama model; assumes an Ollama server is running on localhost:11434
llm = LLM(model="ollama/llama3.2", base_url="http://localhost:11434", temperature=0.3)


class TextSummarizerArgs(BaseModel):
    """Input model for the TextSummarizerTool."""
    text_content: str = Field(..., description="The text content to be summarized.")


class TextSummarizerTool(BaseTool):
    """
    A tool to summarize a long piece of text using Ollama and llama3.2.
    """
    name: str = "Text Summarizer"
    description: str = "Summarizes a given long text into a shorter, concise version (max 20 lines) using Ollama and the llama3.2 model."
    args_schema: Type[BaseModel] = TextSummarizerArgs

    def _run(self, text_content: str) -> str:
        """
        The main execution method for the tool.
        It sends the text to a local Ollama instance for summarization.
        """
        try:
            prompt = f"Please summarize the following text in no more than 20 lines:\n\n{text_content}"
            # Use the module-level LLM; BaseTool has no `llm` attribute of its own.
            summary = llm.call(prompt)
            return f"Summary:\n\n{summary}"
        except Exception as e:
            return f"An unexpected error occurred during summarization: {e}"


###############################################################################
# Multi-agent pipeline (main script)
###############################################################################
from scrapper_tool import WebsiteContentScraper
from summarizer_tool import TextSummarizerTool
from crewai import Agent, Task, Crew, LLM

# Initialize the tools
scrape_tool = WebsiteContentScraper()
summarizer_tool = TextSummarizerTool()

llm = LLM(model="ollama/llama3.2", base_url="http://localhost:11434", temperature=0.3)


def main():
    url_to_process = "https://www.zdnet.com/article/i-deciphered-apples-iphone-17-event-invite-my-3-biggest-theories-for-whats-expected/"
    print(f"Processing URL: {url_to_process}\n")

    # Agent 1: Scraper
    # This agent is responsible only for scraping the website content.
    scraper_agent = Agent(
        role='Website Content Scraper',
        goal=f'Scrape the full text content from the website: {url_to_process}',
        backstory=(
            "You are a diligent web scraper. Your only job is to take a URL "
            "and use your tool to extract all the text from it. You do not analyze or summarize."
        ),
        verbose=True,
        allow_delegation=False,
        tools=[scrape_tool],
        llm=llm,
        max_iter=3,
        max_retry_limit=3,
    )

    # Agent 2: Summarizer
    # This agent is responsible only for summarizing text.
    summarizer_agent = Agent(
        role='Expert Text Summarizer',
        goal='Summarize the text content provided to you into a concise, 20-line summary.',
        backstory=(
            "You are a master of summarization. You take long pieces of text and distill them "
            "into their most important parts, creating a short and easy-to-read summary."
        ),
        verbose=True,
        allow_delegation=False,
        tools=[summarizer_tool],
        llm=llm,
    )

    # Create tasks for the agents
    # Task for the scraper agent
    scrape_task = Task(
        description=f'Use the Website Content Scraper tool to scrape the full text content from the URL: {url_to_process}.',
        expected_output='The complete text content of the webpage, ready for the summarizer.',
        agent=scraper_agent,
    )

    # Task for the summarizer agent
    summarize_task = Task(
        description='Take the scraped text content from the previous task and use the Text Summarizer tool to create a concise summary of no more than 20 lines.',
        expected_output='A summary of the article in 20 lines or less.',
        agent=summarizer_agent,
        context=[scrape_task],  # Ensures this task runs after scrape_task and uses its output
    )

    # Create and run the crew with both agents and both tasks
    website_research_crew = Crew(
        agents=[scraper_agent, summarizer_agent],
        tasks=[scrape_task, summarize_task],
        verbose=True,
    )

    print("\n--- Running Crew to Scrape and Summarize ---")
    result = website_research_crew.kickoff()

    print("\n\n--------------------")
    print("--- CrewAI Process Complete ---")
    print("Final Result:")
    print(result)
    print("--------------------")


# Run the full pipeline when executed as a script
if __name__ == '__main__':
    main()
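

###############################################################################
# Optional preflight check (not part of the original gist; could live in its
# own file). Both tools and the crew above assume a local Ollama server with
# the llama3.2 model pulled; this sketch pings the server's root endpoint
# before you kick off the crew.
###############################################################################
import requests

def ollama_is_up(base_url: str = "http://localhost:11434") -> bool:
    """Return True if the local Ollama server responds (it answers GET /
    with a short status message)."""
    try:
        return requests.get(base_url, timeout=2).ok
    except requests.exceptions.RequestException:
        return False

if __name__ == "__main__":
    print("Ollama reachable:", ollama_is_up())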