Skip to content

Instantly share code, notes, and snippets.

@filipeandre
Created November 8, 2024 21:57
Show Gist options
  • Save filipeandre/e771770fa47b46e820216c585b0d3939 to your computer and use it in GitHub Desktop.
Save filipeandre/e771770fa47b46e820216c585b0d3939 to your computer and use it in GitHub Desktop.
Search for a word recursively across all pages of a site
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
import time
def is_valid_url(url, base_netloc):
    """Return True if *url* is an http(s) URL on the same site as *base_netloc*.

    Args:
        url: Absolute URL string to test.
        base_netloc: Network location (``host`` or ``host:port``) of the site
            being crawled, as produced by ``urlparse(...).netloc``.

    Returns:
        True when the scheme is ``http`` or ``https`` and the URL's netloc
        equals *base_netloc*, ignoring letter case; False otherwise.
    """
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        return False
    # Host names are case-insensitive, but urlparse() keeps netloc as written,
    # so normalise both sides before comparing.
    return parsed.netloc.lower() == base_netloc.lower()
def get_all_links(content, base_url):
    """Extract the set of same-site links found in an HTML document.

    Args:
        content: HTML markup of the page.
        base_url: URL used both to resolve relative ``href`` values and to
            decide which links count as internal (same netloc).

    Returns:
        A set of absolute URLs, with any ``#fragment`` removed, for every
        ``<a href=...>`` that ``is_valid_url`` accepts for base_url's netloc.
    """
    soup = BeautifulSoup(content, "html.parser")
    # The site's netloc does not change per link; compute it once.
    base_netloc = urlparse(base_url).netloc
    links = set()
    for a_tag in soup.find_all("a", href=True):
        full_url = urljoin(base_url, a_tag["href"])
        # "/page#a" and "/page#b" name the same document; drop the fragment so
        # the same page is not reported (and later fetched) more than once.
        full_url = urlparse(full_url)._replace(fragment="").geturl()
        if is_valid_url(full_url, base_netloc):
            links.add(full_url)
    return links
def search_word_in_site(base_url, search_word):
    """Crawl a site starting at *base_url* and report pages containing a word.

    Each reachable same-site page is fetched once. The raw response text is
    searched for *search_word* as a whole word, ignoring case. Progress and
    the final summary are printed to stdout.

    Args:
        base_url: URL where the crawl starts.
        search_word: Literal text to look for; it is regex-escaped and wrapped
            in ``\\b`` word-boundary anchors.

    Returns:
        List of URLs whose content matched, in the order they were visited.
    """
    # Build the pattern once instead of once per fetched page.
    pattern = re.compile(rf'\b{re.escape(search_word)}\b', re.IGNORECASE)

    visited = set()
    to_visit = {base_url}
    found_pages = []

    while to_visit:
        current_url = to_visit.pop()
        if current_url in visited:
            continue
        # Mark before fetching so a page that links to itself is not re-queued.
        visited.add(current_url)

        print(f"Visiting: {current_url}")
        try:
            response = requests.get(current_url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error accessing {current_url}: {e}")
        else:
            content = response.text

            # Search for the word, ignoring case
            if pattern.search(content):
                print(f"--> Found '{search_word}' in {current_url}")
                found_pages.append(current_url)

            # Resolve relative links against the page they appear on, not the
            # crawl root; otherwise hrefs on nested pages point to wrong URLs.
            # current_url (not the post-redirect URL) keeps the same-site check
            # anchored to the original netloc.
            links = get_all_links(content, current_url)
            to_visit.update(links - visited)

        # Pause between requests, whether or not the fetch succeeded.
        time.sleep(0.5)

    print("\nSearch complete.")
    if found_pages:
        print(f"The word '{search_word}' was found in the following pages:")
        for page in found_pages:
            print(f"- {page}")
    else:
        print(f"The word '{search_word}' was not found on the site.")

    return found_pages
if __name__ == "__main__":
    # Script entry point. Both values below are placeholders: set the first to
    # the site's starting URL and the second to the word to look for.
    base_url, search_word = "###", "###"
    search_word_in_site(base_url=base_url, search_word=search_word)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment