Created
August 30, 2022 19:18
-
-
Save nullenc0de/64c88827c8ffd5a6e3b7e3954c5005e0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Breadth-first email-harvesting crawler.
#
# Starting from a user-supplied URL, it downloads each page, collects every
# email address it finds, queues links that stay on the starting domain, and
# optionally writes the unique addresses to a CSV file.
import re
import csv
import time
import urllib.request
from collections import deque
from urllib.parse import urljoin

# Starting point for the crawl.
url = input("Enter the starting URL: ")

# URLs already queued or fetched — seed with the start URL so it is
# never re-queued if a page links back to it.
visited = {url}
# Unique email addresses found so far.
emails = set()

print("Scanning for email addresses...")

# Number of pages fetched so far.
link_count = 0
# BFS frontier; deque gives O(1) pops from the left (list.pop(0) is O(n)).
queue = deque()

# Matches email addresses (case-insensitive).
email_regex = re.compile(r"[a-z0-9\.\-\_\+]+@[a-z0-9\-\_]+(\.[a-z0-9\-\_]+){1,}", re.I)
# Captures the href value of anchor tags.
link_regex = re.compile(r"<a [^>]*href=['\"]([^'\"]+)", re.I)
# Captures the host of a URL (scheme, credentials and "www." stripped).
domain_regex = re.compile(r"^(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)?([^:\/\n]+)", re.I)

# The crawl is restricted to this domain.
domain = domain_regex.match(url).group(1)

# Used to throttle progress updates to at most one per second.
last_time = time.time()

while True:
    try:
        response = urllib.request.urlopen(url)
        # NOTE(review): assumes UTF-8; pages in other encodings raise
        # and are skipped like any other fetch error.
        html = response.read().decode("utf-8")
    except Exception as exc:
        # Skip unreachable/undecodable pages instead of aborting the
        # whole crawl (the original bare except + break killed it on
        # the first error).
        print("Error fetching %s (%s)" % (url, exc))
        html = ""

    link_count += 1
    # Progress report every 10 pages, at most once per second.
    if link_count % 10 == 0 and time.time() - last_time > 1:
        print("Found %s email addresses in %s links" % (len(emails), link_count))
        last_time = time.time()

    # Harvest every email address on the page; the set deduplicates.
    for match in email_regex.finditer(html):
        emails.add(match.group(0))

    # Queue every same-domain link we have not seen yet.
    for match in link_regex.finditer(html):
        # Resolve relative links ("/about", "page.html") against the
        # current page so they can be crawled too.
        link = urljoin(url, match.group(1))
        host = domain_regex.match(link)
        # Only follow links whose host matches the starting domain
        # (the original check accepted any URL, wandering off-domain).
        if not host or host.group(1) != domain:
            continue
        if link not in visited:
            visited.add(link)
            queue.append(link)

    # Crawl is complete when the frontier is empty.
    if not queue:
        break
    url = queue.popleft()

# Report the deduplicated count, not raw match count.
print("Done! Found %s email addresses in %s links" % (len(emails), link_count))

save = input("Do you want to save the results to a CSV file? (y/n) ")
if save.lower() == "y":
    filename = input("Enter a filename: ")
    # newline="" is required by the csv module so rows are not
    # double-spaced on Windows.
    with open(filename, "w", newline="") as file:
        writer = csv.writer(file)
        # Sorted output makes the file deterministic and easy to diff.
        for email in sorted(emails):
            writer.writerow([email])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment