Search for a word recursively across all pages of a site
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
import time


def is_valid_url(url, base_netloc):
    """
    Check if the URL is valid and belongs to the same domain.
    """
    parsed = urlparse(url)
    return parsed.scheme in ["http", "https"] and parsed.netloc == base_netloc


def get_all_links(content, base_url):
    """
    Extract all internal links from the page content.
    """
    soup = BeautifulSoup(content, "html.parser")
    links = set()
    for a_tag in soup.find_all("a", href=True):
        href = a_tag['href']
        full_url = urljoin(base_url, href)
        if is_valid_url(full_url, urlparse(base_url).netloc):
            links.add(full_url)
    return links


def search_word_in_site(base_url, search_word):
    """
    Crawl the website from base_url, searching for search_word in each page.
    """
    visited = set()
    to_visit = set([base_url])
    found_pages = []

    while to_visit:
        current_url = to_visit.pop()
        if current_url in visited:
            continue
        print(f"Visiting: {current_url}")
        try:
            response = requests.get(current_url, timeout=10)
            response.raise_for_status()
            content = response.text

            # Search for the word, ignoring case
            if re.search(rf'\b{re.escape(search_word)}\b', content, re.IGNORECASE):
                print(f"--> Found '{search_word}' in {current_url}")
                found_pages.append(current_url)

            # Parse and collect new links. Resolve relative hrefs against the
            # page they appear on (current_url), not the site root, so links
            # on nested pages are joined correctly.
            links = get_all_links(content, current_url)
            to_visit.update(links - visited)
        except requests.RequestException as e:
            print(f"Error accessing {current_url}: {e}")

        visited.add(current_url)
        time.sleep(0.5)  # brief pause between requests to be polite to the server

    print("\nSearch complete.")
    if found_pages:
        print(f"The word '{search_word}' was found in the following pages:")
        for page in found_pages:
            print(f"- {page}")
    else:
        print(f"The word '{search_word}' was not found on the site.")


if __name__ == "__main__":
    # Replace these with your target URL and search word
    base_url = "###"
    search_word = "###"
    search_word_in_site(base_url, search_word)
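
A quick way to sanity-check the two building blocks before pointing the crawler at a live site, assuming the functions above are already in scope. The example.com URL and the inline HTML snippet are hypothetical stand-ins, not part of the original script:

# Link extraction: relative hrefs resolve against the page URL, and
# off-domain links are dropped by is_valid_url.
html = '<a href="/about">About</a><a href="https://other.example/x">External</a>'
print(get_all_links(html, "https://example.com/index.html"))
# -> {'https://example.com/about'}

# Word matching: the \b boundaries mean "cat" matches "The cat sat."
# but not "concatenate", regardless of case.
print(bool(re.search(rf'\b{re.escape("cat")}\b', "The cat sat.", re.IGNORECASE)))  # True
print(bool(re.search(rf'\b{re.escape("cat")}\b', "concatenate", re.IGNORECASE)))   # False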