Created
August 30, 2022 19:18
-
-
Save nullenc0de/64c88827c8ffd5a6e3b7e3954c5005e0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Breadth-first email-harvesting crawler.
#
# Starting from a user-supplied URL, it downloads each page, collects every
# email address it finds, queues links that stay on the starting domain, and
# optionally writes the unique addresses to a CSV file.
import re
import csv
import time
import urllib.request
from collections import deque
from urllib.parse import urljoin

# Starting point for the crawl.
url = input("Enter the starting URL: ")

# URLs already queued or fetched — seed with the start URL so it is
# never re-queued if a page links back to it.
visited = {url}
# Unique email addresses found so far.
emails = set()

print("Scanning for email addresses...")

# Number of pages fetched so far.
link_count = 0
# BFS frontier; deque gives O(1) pops from the left (list.pop(0) is O(n)).
queue = deque()

# Matches email addresses (case-insensitive).
email_regex = re.compile(r"[a-z0-9\.\-\_\+]+@[a-z0-9\-\_]+(\.[a-z0-9\-\_]+){1,}", re.I)
# Captures the href value of anchor tags.
link_regex = re.compile(r"<a [^>]*href=['\"]([^'\"]+)", re.I)
# Captures the host of a URL (scheme, credentials and "www." stripped).
domain_regex = re.compile(r"^(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)?([^:\/\n]+)", re.I)

# The crawl is restricted to this domain.
domain = domain_regex.match(url).group(1)

# Used to throttle progress updates to at most one per second.
last_time = time.time()

while True:
    try:
        response = urllib.request.urlopen(url)
        # NOTE(review): assumes UTF-8; pages in other encodings raise
        # and are skipped like any other fetch error.
        html = response.read().decode("utf-8")
    except Exception as exc:
        # Skip unreachable/undecodable pages instead of aborting the
        # whole crawl (the original bare except + break killed it on
        # the first error).
        print("Error fetching %s (%s)" % (url, exc))
        html = ""

    link_count += 1
    # Progress report every 10 pages, at most once per second.
    if link_count % 10 == 0 and time.time() - last_time > 1:
        print("Found %s email addresses in %s links" % (len(emails), link_count))
        last_time = time.time()

    # Harvest every email address on the page; the set deduplicates.
    for match in email_regex.finditer(html):
        emails.add(match.group(0))

    # Queue every same-domain link we have not seen yet.
    for match in link_regex.finditer(html):
        # Resolve relative links ("/about", "page.html") against the
        # current page so they can be crawled too.
        link = urljoin(url, match.group(1))
        host = domain_regex.match(link)
        # Only follow links whose host matches the starting domain
        # (the original check accepted any URL, wandering off-domain).
        if not host or host.group(1) != domain:
            continue
        if link not in visited:
            visited.add(link)
            queue.append(link)

    # Crawl is complete when the frontier is empty.
    if not queue:
        break
    url = queue.popleft()

# Report the deduplicated count, not raw match count.
print("Done! Found %s email addresses in %s links" % (len(emails), link_count))

save = input("Do you want to save the results to a CSV file? (y/n) ")
if save.lower() == "y":
    filename = input("Enter a filename: ")
    # newline="" is required by the csv module so rows are not
    # double-spaced on Windows.
    with open(filename, "w", newline="") as file:
        writer = csv.writer(file)
        # Sorted output makes the file deterministic and easy to diff.
        for email in sorted(emails):
            writer.writerow([email])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment