@mohit
Created September 4, 2025 21:54
Website Crawler using crawl4ai - crawls websites to a specified depth, saves page content as markdown files, and downloads any PDFs found along the way
#!/usr/bin/env python3
"""
Website Crawler using crawl4ai
Crawls websites with specified depth and saves content as markdown files.
Also downloads PDF files found during the crawl.
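
Usage (flags as defined by the argument parser in main()):
    python website_crawler.py <url> [-o OUTPUT_DIR] [-d MAX_DEPTH] [-p MAX_PAGES]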
"""
import asyncio
import os
import re
import logging
from urllib.parse import urljoin, urlparse, quote
from pathlib import Path
from typing import Set, List, Dict
import aiohttp
import argparse
import sys
try:
    from crawl4ai import AsyncWebCrawler
except ImportError:
    print("crawl4ai not installed. Please run: pip install crawl4ai && crawl4ai-setup")
    sys.exit(1)


class WebsiteCrawler:
    def __init__(self, base_url: str, output_dir: str = "crawled_content",
                 max_depth: int = 2, max_pages: int = 100):
        self.base_url = base_url.rstrip('/')
        self.domain = urlparse(base_url).netloc
        self.output_dir = Path(output_dir)
        self.max_depth = max_depth
        self.max_pages = max_pages

        # Tracking sets
        self.visited_urls: Set[str] = set()
        self.pdf_urls: Set[str] = set()
        self.crawled_pages = 0

        # Setup logging
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
        self.logger = logging.getLogger(__name__)

        # Create output directories
        self.pages_dir = self.output_dir / "pages"
        self.pdfs_dir = self.output_dir / "pdfs"
        self.pages_dir.mkdir(parents=True, exist_ok=True)
        self.pdfs_dir.mkdir(parents=True, exist_ok=True)

    def is_same_domain(self, url: str) -> bool:
        """Check if URL belongs to the same domain"""
        return urlparse(url).netloc == self.domain

    def extract_links(self, html_content: str, base_url: str) -> List[str]:
        """Extract all links from HTML content"""
        # Simple regex to find href attributes
        href_pattern = r'href=["\']([^"\']+)["\']'
        links = re.findall(href_pattern, html_content, re.IGNORECASE)
        absolute_links = []
        for link in links:
            if link.startswith(('http://', 'https://')):
                absolute_links.append(link)
            elif link.startswith('/'):
                absolute_links.append(urljoin(self.base_url, link))
            elif not link.startswith(('mailto:', 'tel:', 'javascript:', '#')):
                absolute_links.append(urljoin(base_url, link))
        return list(set(absolute_links))

    def extract_pdf_links(self, html_content: str, base_url: str) -> List[str]:
        """Extract PDF links from HTML content"""
        pdf_pattern = r'href=["\']([^"\']*\.pdf[^"\']*)["\']'
        pdf_links = re.findall(pdf_pattern, html_content, re.IGNORECASE)
        absolute_pdf_links = []
        for link in pdf_links:
            if link.startswith(('http://', 'https://')):
                absolute_pdf_links.append(link)
            elif link.startswith('/'):
                absolute_pdf_links.append(urljoin(self.base_url, link))
            else:
                absolute_pdf_links.append(urljoin(base_url, link))
        return absolute_pdf_links

    def sanitize_filename(self, url: str) -> str:
        """Convert URL to a safe filename"""
        parsed = urlparse(url)
        path = parsed.path.strip('/') or 'index'
        # Replace problematic characters
        filename = re.sub(r'[^\w\-_./]', '_', path)
        filename = filename.replace('/', '_')
        # Ensure .md extension
        if not filename.endswith('.md'):
            filename += '.md'
        return filename

    async def download_pdf(self, session: aiohttp.ClientSession, pdf_url: str):
        """Download a PDF file"""
        try:
            self.logger.info(f"Downloading PDF: {pdf_url}")
            async with session.get(pdf_url) as response:
                if response.status == 200:
                    content = await response.read()
                    # Generate filename from URL
                    filename = os.path.basename(urlparse(pdf_url).path)
                    if not filename or not filename.endswith('.pdf'):
                        filename = f"pdf_{len(self.pdf_urls)}.pdf"
                    pdf_path = self.pdfs_dir / filename
                    # Handle duplicate filenames
                    counter = 1
                    while pdf_path.exists():
                        name, ext = os.path.splitext(filename)
                        pdf_path = self.pdfs_dir / f"{name}_{counter}{ext}"
                        counter += 1
                    with open(pdf_path, 'wb') as f:
                        f.write(content)
                    self.logger.info(f"Downloaded PDF: {pdf_path}")
                else:
                    self.logger.warning(f"Failed to download PDF {pdf_url}: {response.status}")
        except Exception as e:
            self.logger.error(f"Error downloading PDF {pdf_url}: {e}")

    async def crawl_page(self, crawler: AsyncWebCrawler, session: aiohttp.ClientSession,
                         url: str, depth: int) -> List[str]:
        """Crawl a single page and return found links"""
        if (url in self.visited_urls or
                depth > self.max_depth or
                self.crawled_pages >= self.max_pages or
                not self.is_same_domain(url)):
            return []
        self.visited_urls.add(url)
        self.crawled_pages += 1
        self.logger.info(f"Crawling (depth {depth}): {url}")
        try:
            # Crawl the page
            result = await crawler.arun(
                url=url,
                word_count_threshold=10,
                bypass_cache=True
            )
            if not result.success:
                self.logger.warning(f"Failed to crawl {url}: {result.error_message}")
                return []

            # Save markdown content
            filename = self.sanitize_filename(url)
            file_path = self.pages_dir / filename
            # Handle duplicate filenames
            counter = 1
            while file_path.exists():
                name, ext = os.path.splitext(filename)
                file_path = self.pages_dir / f"{name}_{counter}{ext}"
                counter += 1

            # Create markdown content with metadata
            page_title = getattr(result, 'title', None) or url.split('/')[-1] or 'Untitled'
            content = f"# {page_title}\n\n"
            content += f"**URL:** {url}\n"
            content += f"**Success:** {result.success}\n\n"
            content += "---\n\n"
            content += result.markdown or "No content extracted"
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(content)
            self.logger.info(f"Saved: {file_path}")

            # Extract and download PDFs
            if result.html:
                pdf_links = self.extract_pdf_links(result.html, url)
                for pdf_url in pdf_links:
                    if pdf_url not in self.pdf_urls:
                        self.pdf_urls.add(pdf_url)
                        await self.download_pdf(session, pdf_url)

            # Extract links for further crawling (only if HTML was returned)
            if depth < self.max_depth and result.html:
                return self.extract_links(result.html, url)
            return []
        except Exception as e:
            self.logger.error(f"Error crawling {url}: {e}")
            return []

    async def crawl_website(self):
        """Main crawling function"""
        self.logger.info(f"Starting crawl of {self.base_url}")
        self.logger.info(f"Max depth: {self.max_depth}, Max pages: {self.max_pages}")
        self.logger.info(f"Output directory: {self.output_dir}")
        # URLs to crawl at each depth level
        current_level_urls = [self.base_url]
        async with AsyncWebCrawler(verbose=False) as crawler:
            async with aiohttp.ClientSession() as session:
                for depth in range(self.max_depth + 1):
                    if not current_level_urls or self.crawled_pages >= self.max_pages:
                        break
                    self.logger.info(f"Crawling depth {depth} - {len(current_level_urls)} URLs")
                    next_level_urls = []
                    # Process URLs at current depth
                    for url in current_level_urls:
                        if self.crawled_pages >= self.max_pages:
                            break
                        found_links = await self.crawl_page(crawler, session, url, depth)
                        # Add new links for next depth level
                        for link in found_links:
                            if (link not in self.visited_urls and
                                    self.is_same_domain(link) and
                                    link not in next_level_urls):
                                next_level_urls.append(link)
                    current_level_urls = next_level_urls[:self.max_pages - self.crawled_pages]
        self.logger.info("Crawling completed!")
        self.logger.info(f"Total pages crawled: {self.crawled_pages}")
        self.logger.info(f"Total PDFs downloaded: {len(self.pdf_urls)}")
        self.logger.info(f"Content saved to: {self.output_dir}")


def main():
    parser = argparse.ArgumentParser(description='Crawl a website and save content as markdown')
    parser.add_argument('url', help='Base URL to crawl')
    parser.add_argument('-o', '--output', default='crawled_content',
                        help='Output directory (default: crawled_content)')
    parser.add_argument('-d', '--depth', type=int, default=2,
                        help='Maximum crawl depth (default: 2)')
    parser.add_argument('-p', '--pages', type=int, default=100,
                        help='Maximum number of pages to crawl (default: 100)')
    args = parser.parse_args()
    crawler = WebsiteCrawler(
        base_url=args.url,
        output_dir=args.output,
        max_depth=args.depth,
        max_pages=args.pages
    )
    try:
        asyncio.run(crawler.crawl_website())
    except KeyboardInterrupt:
        print("\nCrawling interrupted by user")
    except Exception as e:
        print(f"Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()

mohit commented Sep 4, 2025

Prerequisites

pip install "crawl4ai>=0.7.0"
pip install "aiohttp>=3.8.0"
crawl4ai-setup

Example usage

python website_crawler.py https://edd.ca.gov/en/disability/DI_Forms_and_Publications/ -d 2 -p 20
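
If you prefer to drive the crawler from Python instead of the CLI, here is a minimal sketch using only the WebsiteCrawler constructor arguments and the crawl_website() coroutine defined above; the output directory name is illustrative:

import asyncio
from website_crawler import WebsiteCrawler  # assumes the gist is saved as website_crawler.py

crawler = WebsiteCrawler(
    base_url="https://edd.ca.gov/en/disability/DI_Forms_and_Publications/",
    output_dir="edd_content",  # illustrative output directory
    max_depth=2,
    max_pages=20,
)
asyncio.run(crawler.crawl_website())
# Markdown pages land in edd_content/pages, downloaded PDFs in edd_content/pdfs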
