@nullenc0de
Created August 30, 2022 19:18
import re
import csv
import time
import urllib.request
# Enter the URL you want to start from
url = input("Enter the starting URL: ")
# This is where we store the links we've already visited
visited = set()
# This is where we store the emails we've found
emails = set()
# Let the user know what's going on
print("Scanning for email addresses...")
# We want to keep track of how many links we've visited
link_count = 0
# This queue holds the links waiting to be crawled
queue = []
# We also want to keep track of how many emails we've found
email_count = 0
# This is a regular expression for finding email addresses
email_regex = re.compile(r"[a-z0-9\.\-\_\+]+@[a-z0-9\-\_]+(\.[a-z0-9\-\_]+){1,}", re.I)
# This is a regular expression for finding links
link_regex = re.compile(r"<a [^>]*href=['\"]([^'\"]+)", re.I)
# This is a regular expression for finding the domain of a link
domain_regex = re.compile(r"^(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)?([^:\/\n]+)", re.I)
# This is the domain we're interested in
domain = domain_regex.match(url).group(1)
# Mark the starting URL as visited so it isn't queued again later
visited.add(url)
# We use this to keep track of when we should display a status update
last_time = time.time()
# This is the main loop
while True:
    # Try to request the URL
    try:
        # Get the URL's contents
        response = urllib.request.urlopen(url)
        # Convert the response to text (replacing any undecodable bytes)
        html = response.read().decode("utf-8", errors="replace")
    except Exception:
        # Something went wrong, so skip this URL and move on
        print("Error fetching %s" % url)
        if len(queue) == 0:
            break
        url = queue.pop(0)
        continue
    # We visited one more link
    link_count += 1
    # Update the user every 10 links, at most once per second
    if link_count % 10 == 0 and time.time() - last_time > 1:
        print("Found %s email addresses in %s links" % (email_count, link_count))
        last_time = time.time()
    # Find all the email addresses in the response
    for match in email_regex.finditer(html):
        # Add it to our set (duplicates are ignored automatically)
        emails.add(match.group(0))
    # Keep the running count in sync with the set of unique addresses
    email_count = len(emails)
    # Find all the links in the response
    for match in link_regex.finditer(html):
        # Get the link
        link = match.group(1)
        # If it's not a link to our domain, skip it
        link_domain = domain_regex.match(link)
        if not link_domain or link_domain.group(1) != domain:
            continue
        # If we haven't visited it yet, add it to the queue
        if link not in visited:
            # Mark it as visited
            visited.add(link)
            # Add it to the queue
            queue.append(link)
    # If we're out of links, we're done
    if len(queue) == 0:
        break
    # Get the next link in the queue
    url = queue.pop(0)
# Let the user know we're done
print("Done! Found %s email addresses in %s links" % (email_count, link_count))
# Ask the user if they want to save the results
save = input("Do you want to save the results to a CSV file? (y/n) ")
# If they want to save the results
if save.lower() == "y":
    # Get a filename from the user
    filename = input("Enter a filename: ")
    # Open the file for writing (newline="" avoids blank rows in CSV output on Windows)
    with open(filename, "w", newline="") as file:
        # Create a CSV writer
        writer = csv.writer(file)
        # Write the emails to the CSV file
        for email in emails:
            writer.writerow([email])
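
Note: the domain check in the crawl loop drops relative links such as href="/contact", since they never match the crawl domain. A minimal sketch of how they could be resolved into absolute URLs before queueing, using urljoin and urldefrag from the standard library's urllib.parse; resolve_link is a hypothetical helper, not part of the original script:

from urllib.parse import urljoin, urldefrag

def resolve_link(page_url, href):
    # Turn a possibly-relative href into an absolute URL
    # relative to the page it was found on, and strip any
    # "#fragment" so the same page isn't queued twice.
    absolute, _fragment = urldefrag(urljoin(page_url, href))
    return absolute

# Example:
# resolve_link("https://example.com/a/b.html", "../contact")
# -> "https://example.com/contact"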