@hernamesbarbara
Last active October 20, 2018 20:21
Fetch any mailto: email addresses found on a web page

Basic script to extract any email addresses that can be found on a web page.

Usage

Get all the emails you can find on this page:

https://zombierecords.com/staff/

$ python3 findemails.py https://zombierecords.com/staff/
email,label
[email protected],[email protected]
[email protected],Contact
[email protected],Contact
[email protected],Contact
[email protected],Contact
[email protected],Contact
[email protected],Contact
[email protected],Contact
[email protected],Contact
[email protected],Contact
[email protected],Contact
[email protected],Contact
[email protected],Contact
[email protected],Contact
[email protected],Contact
[email protected],Contact
[email protected],Contact
[email protected],Contact
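
The script relies on a few third-party libraries. A minimal install sketch, assuming the standard PyPI package names for the imports used below:

$ pip3 install docopt requests beautifulsoup4 html5lib pandas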
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""findemails.py

Usage:
    findemails URL [--output OUTPUT]

Arguments:
    URL                 Website from which you want to extract email addresses

Options:
    -o --output OUTPUT  Where to write the CSV [default: stdout].
    -h --help           Show this message.

Examples:
    findemails https://zombierecords.com/staff/
    findemails https://zombierecords.com/staff/ --output zombie-emails.csv
"""
import sys

from docopt import docopt
import requests
from bs4 import BeautifulSoup
import html5lib  # noqa: F401 -- imported so a missing parser backend fails fast
import pandas as pd


def get_soup(url):
    """Fetch the page at `url` and parse it into a BeautifulSoup tree."""
    r = requests.get(url)
    try:
        soup = BeautifulSoup(r.text, "html5lib")
    except Exception as err:
        sys.stderr.write(str(err))
        soup = None
    return soup


def find_emails(soup):
    """Collect every mailto: link on the page as {email, label} records."""
    emails = []
    for tag in soup.find_all("a"):
        link = tag.get('href', '').strip()
        if link and link.startswith('mailto:'):
            # Drop the "mailto:" scheme and any "?subject=..." query string.
            email = link[len('mailto:'):].split('?')[0].strip()
            label = tag.get_text().strip()
            emails.append({"email": email, "label": label})
    return emails


def main():
    args = docopt(__doc__)
    url = args['URL']
    output = args['--output']

    soup = get_soup(url)
    if not soup:
        sys.stderr.write("couldn't access the URL provided\n")
        sys.exit(1)

    emails = find_emails(soup)
    if not emails:
        sys.stderr.write("couldn't find any emails\n")
        sys.exit(1)

    emails = pd.DataFrame(emails)
    outfile = sys.stdout if output == 'stdout' else output
    emails.to_csv(outfile, index=False, encoding='utf-8')
    sys.exit(0)


if __name__ == '__main__':
    main()
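
If you want to reuse the extraction logic without fetching a live page, find_emails works on any BeautifulSoup tree. A minimal sketch, assuming the script is importable as a module named findemails and using a made-up HTML snippet for illustration:

from bs4 import BeautifulSoup
from findemails import find_emails

# Hypothetical snippet standing in for a fetched page.
html = '<a href="mailto:booking@example.com?subject=hello">Booking</a>'
soup = BeautifulSoup(html, "html5lib")

print(find_emails(soup))
# -> [{'email': 'booking@example.com', 'label': 'Booking'}]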