Created September 4, 2025 21:54
Website Crawler using crawl4ai - Crawls websites with specified depth and saves content as markdown files with PDF downloads
#!/usr/bin/env python3
"""
Website Crawler using crawl4ai
Crawls websites with specified depth and saves content as markdown files.
Also downloads PDF files found during the crawl.
"""
import asyncio
import os
import re
import logging
from urllib.parse import urljoin, urlparse, quote
from pathlib import Path
from typing import Set, List, Dict
import aiohttp
import argparse
import sys

try:
    from crawl4ai import AsyncWebCrawler
except ImportError:
    print("crawl4ai not installed. Please run: pip install crawl4ai && crawl4ai-setup")
    sys.exit(1)

class WebsiteCrawler:
    def __init__(self, base_url: str, output_dir: str = "crawled_content",
                 max_depth: int = 2, max_pages: int = 100):
        self.base_url = base_url.rstrip('/')
        self.domain = urlparse(base_url).netloc
        self.output_dir = Path(output_dir)
        self.max_depth = max_depth
        self.max_pages = max_pages

        # Tracking sets
        self.visited_urls: Set[str] = set()
        self.pdf_urls: Set[str] = set()
        self.crawled_pages = 0

        # Setup logging
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
        self.logger = logging.getLogger(__name__)

        # Create output directories
        self.pages_dir = self.output_dir / "pages"
        self.pdfs_dir = self.output_dir / "pdfs"
        self.pages_dir.mkdir(parents=True, exist_ok=True)
        self.pdfs_dir.mkdir(parents=True, exist_ok=True)

    def is_same_domain(self, url: str) -> bool:
        """Check if URL belongs to the same domain"""
        return urlparse(url).netloc == self.domain

    def extract_links(self, html_content: str, base_url: str) -> List[str]:
        """Extract all links from HTML content"""
        # Simple regex to find href attributes
        href_pattern = r'href=["\']([^"\']+)["\']'
        links = re.findall(href_pattern, html_content, re.IGNORECASE)

        absolute_links = []
        for link in links:
            if link.startswith(('http://', 'https://')):
                absolute_links.append(link)
            elif link.startswith('/'):
                absolute_links.append(urljoin(self.base_url, link))
            elif not link.startswith(('mailto:', 'tel:', 'javascript:', '#')):
                absolute_links.append(urljoin(base_url, link))

        return list(set(absolute_links))

    def extract_pdf_links(self, html_content: str, base_url: str) -> List[str]:
        """Extract PDF links from HTML content"""
        pdf_pattern = r'href=["\']([^"\']*\.pdf[^"\']*)["\']'
        pdf_links = re.findall(pdf_pattern, html_content, re.IGNORECASE)

        absolute_pdf_links = []
        for link in pdf_links:
            if link.startswith(('http://', 'https://')):
                absolute_pdf_links.append(link)
            elif link.startswith('/'):
                absolute_pdf_links.append(urljoin(self.base_url, link))
            else:
                absolute_pdf_links.append(urljoin(base_url, link))

        return absolute_pdf_links

    def sanitize_filename(self, url: str) -> str:
        """Convert URL to a safe filename"""
        parsed = urlparse(url)
        path = parsed.path.strip('/') or 'index'

        # Replace problematic characters
        filename = re.sub(r'[^\w\-_./]', '_', path)
        filename = filename.replace('/', '_')

        # Ensure .md extension
        if not filename.endswith('.md'):
            filename += '.md'

        return filename

    async def download_pdf(self, session: aiohttp.ClientSession, pdf_url: str):
        """Download a PDF file"""
        try:
            self.logger.info(f"Downloading PDF: {pdf_url}")
            async with session.get(pdf_url) as response:
                if response.status == 200:
                    content = await response.read()

                    # Generate filename from URL
                    filename = os.path.basename(urlparse(pdf_url).path)
                    if not filename or not filename.endswith('.pdf'):
                        filename = f"pdf_{len(self.pdf_urls)}.pdf"

                    pdf_path = self.pdfs_dir / filename

                    # Handle duplicate filenames
                    counter = 1
                    while pdf_path.exists():
                        name, ext = os.path.splitext(filename)
                        pdf_path = self.pdfs_dir / f"{name}_{counter}{ext}"
                        counter += 1

                    with open(pdf_path, 'wb') as f:
                        f.write(content)
                    self.logger.info(f"Downloaded PDF: {pdf_path}")
                else:
                    self.logger.warning(f"Failed to download PDF {pdf_url}: {response.status}")
        except Exception as e:
            self.logger.error(f"Error downloading PDF {pdf_url}: {e}")

    async def crawl_page(self, crawler: AsyncWebCrawler, session: aiohttp.ClientSession,
                         url: str, depth: int) -> List[str]:
        """Crawl a single page and return found links"""
        if (url in self.visited_urls or
                depth > self.max_depth or
                self.crawled_pages >= self.max_pages or
                not self.is_same_domain(url)):
            return []

        self.visited_urls.add(url)
        self.crawled_pages += 1
        self.logger.info(f"Crawling (depth {depth}): {url}")

        try:
            # Crawl the page
            result = await crawler.arun(
                url=url,
                word_count_threshold=10,
                bypass_cache=True
            )

            if not result.success:
                self.logger.warning(f"Failed to crawl {url}: {result.error_message}")
                return []

            # Save markdown content
            filename = self.sanitize_filename(url)
            file_path = self.pages_dir / filename

            # Handle duplicate filenames
            counter = 1
            while file_path.exists():
                name, ext = os.path.splitext(filename)
                file_path = self.pages_dir / f"{name}_{counter}{ext}"
                counter += 1

            # Create markdown content with metadata
            page_title = getattr(result, 'title', None) or url.split('/')[-1] or 'Untitled'
            content = f"# {page_title}\n\n"
            content += f"**URL:** {url}\n"
            content += f"**Success:** {result.success}\n\n"
            content += "---\n\n"
            content += result.markdown or "No content extracted"

            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(content)
            self.logger.info(f"Saved: {file_path}")

            # Extract and download PDFs
            if result.html:
                pdf_links = self.extract_pdf_links(result.html, url)
                for pdf_url in pdf_links:
                    if pdf_url not in self.pdf_urls:
                        self.pdf_urls.add(pdf_url)
                        await self.download_pdf(session, pdf_url)

            # Extract links for further crawling (guard against pages with no HTML
            # so extract_links is never called with None)
            if result.html and depth < self.max_depth:
                return self.extract_links(result.html, url)

            return []
        except Exception as e:
            self.logger.error(f"Error crawling {url}: {e}")
            return []

    async def crawl_website(self):
        """Main crawling function"""
        self.logger.info(f"Starting crawl of {self.base_url}")
        self.logger.info(f"Max depth: {self.max_depth}, Max pages: {self.max_pages}")
        self.logger.info(f"Output directory: {self.output_dir}")

        # URLs to crawl at each depth level
        current_level_urls = [self.base_url]

        async with AsyncWebCrawler(verbose=False) as crawler:
            async with aiohttp.ClientSession() as session:
                for depth in range(self.max_depth + 1):
                    if not current_level_urls or self.crawled_pages >= self.max_pages:
                        break

                    self.logger.info(f"Crawling depth {depth} - {len(current_level_urls)} URLs")
                    next_level_urls = []

                    # Process URLs at current depth
                    for url in current_level_urls:
                        if self.crawled_pages >= self.max_pages:
                            break

                        found_links = await self.crawl_page(crawler, session, url, depth)

                        # Add new links for next depth level
                        for link in found_links:
                            if (link not in self.visited_urls and
                                    self.is_same_domain(link) and
                                    link not in next_level_urls):
                                next_level_urls.append(link)

                    current_level_urls = next_level_urls[:self.max_pages - self.crawled_pages]

        self.logger.info("Crawling completed!")
        self.logger.info(f"Total pages crawled: {self.crawled_pages}")
        self.logger.info(f"Total PDFs downloaded: {len(self.pdf_urls)}")
        self.logger.info(f"Content saved to: {self.output_dir}")

def main():
    parser = argparse.ArgumentParser(description='Crawl a website and save content as markdown')
    parser.add_argument('url', help='Base URL to crawl')
    parser.add_argument('-o', '--output', default='crawled_content',
                        help='Output directory (default: crawled_content)')
    parser.add_argument('-d', '--depth', type=int, default=2,
                        help='Maximum crawl depth (default: 2)')
    parser.add_argument('-p', '--pages', type=int, default=100,
                        help='Maximum number of pages to crawl (default: 100)')

    args = parser.parse_args()

    crawler = WebsiteCrawler(
        base_url=args.url,
        output_dir=args.output,
        max_depth=args.depth,
        max_pages=args.pages
    )

    try:
        asyncio.run(crawler.crawl_website())
    except KeyboardInterrupt:
        print("\nCrawling interrupted by user")
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
Prerequisites
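The original installation notes were not preserved in this capture, so the commands below are a minimal sketch inferred from the script itself: its ImportError message names crawl4ai and crawl4ai-setup, and aiohttp is imported directly, so it is assumed to be the only other third-party dependency.

    pip install crawl4ai aiohttp
    crawl4ai-setup    # one-time setup step referenced in the script's error message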
Example Use
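No example invocation survived in the capture; the lines below are a sketch based on the argparse options defined in main(). The filename crawler.py is an assumption, save the gist under whatever name you prefer.

    python crawler.py https://example.com
    python crawler.py https://example.com -d 3 -p 50 -o my_site

The class can also be driven programmatically, mirroring what main() does. Again, the crawler.py module name is hypothetical.

    import asyncio
    from crawler import WebsiteCrawler  # assumes the gist was saved as crawler.py

    # Crawl up to 50 same-domain pages, two link-hops deep from the base URL
    crawler = WebsiteCrawler("https://example.com", output_dir="my_site",
                             max_depth=2, max_pages=50)
    asyncio.run(crawler.crawl_website())

Either way, crawled pages are written as .md files under <output>/pages and any PDFs discovered during the crawl are saved under <output>/pdfs.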