Search for a word recursively across all pages of a site
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
import time


def is_valid_url(url, base_netloc):
    """
    Check if the URL is valid and belongs to the same domain.
    """
    parsed = urlparse(url)
    return parsed.scheme in ["http", "https"] and parsed.netloc == base_netloc


def get_all_links(content, base_url):
    """
    Extract all internal links from the page content.
    """
    soup = BeautifulSoup(content, "html.parser")
    links = set()
    for a_tag in soup.find_all("a", href=True):
        href = a_tag['href']
        full_url = urljoin(base_url, href)
        if is_valid_url(full_url, urlparse(base_url).netloc):
            links.add(full_url)
    return links


def search_word_in_site(base_url, search_word):
    """
    Crawl the website from base_url, searching for search_word in each page.
    """
    visited = set()
    to_visit = set([base_url])
    found_pages = []

    while to_visit:
        current_url = to_visit.pop()
        if current_url in visited:
            continue
        print(f"Visiting: {current_url}")
        try:
            response = requests.get(current_url, timeout=10)
            response.raise_for_status()
            content = response.text

            # Search for the word, ignoring case
            if re.search(rf'\b{re.escape(search_word)}\b', content, re.IGNORECASE):
                print(f"--> Found '{search_word}' in {current_url}")
                found_pages.append(current_url)

            # Parse and collect new links. Resolve relative hrefs against the
            # page they appear on (current_url), not the site root, so links
            # on nested pages are joined correctly.
            links = get_all_links(content, current_url)
            to_visit.update(links - visited)
        except requests.RequestException as e:
            print(f"Error accessing {current_url}: {e}")

        visited.add(current_url)
        time.sleep(0.5)  # brief pause between requests to be polite to the server

    print("\nSearch complete.")
    if found_pages:
        print(f"The word '{search_word}' was found in the following pages:")
        for page in found_pages:
            print(f"- {page}")
    else:
        print(f"The word '{search_word}' was not found on the site.")


if __name__ == "__main__":
    # Replace these with your target URL and search word
    base_url = "###"
    search_word = "###"
    search_word_in_site(base_url, search_word)
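
A quick way to sanity-check the two building blocks before pointing the crawler at a live site, assuming the functions above are already in scope. The example.com URL and the inline HTML snippet are hypothetical stand-ins, not part of the original script:

# Link extraction: relative hrefs resolve against the page URL, and
# off-domain links are dropped by is_valid_url.
html = '<a href="/about">About</a><a href="https://other.example/x">External</a>'
print(get_all_links(html, "https://example.com/index.html"))
# -> {'https://example.com/about'}

# Word matching: the \b boundaries mean "cat" matches "The cat sat."
# but not "concatenate", regardless of case.
print(bool(re.search(rf'\b{re.escape("cat")}\b', "The cat sat.", re.IGNORECASE)))  # True
print(bool(re.search(rf'\b{re.escape("cat")}\b', "concatenate", re.IGNORECASE)))   # False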