Skip to content

Instantly share code, notes, and snippets.

@2016rshah
Last active August 27, 2018 16:10
Show Gist options
  • Save 2016rshah/f308a7354c0564dc87cde5948a72c34b to your computer and use it in GitHub Desktop.
Save 2016rshah/f308a7354c0564dc87cde5948a72c34b to your computer and use it in GitHub Desktop.
UT Directory Web Scraper
# Usage:
# python3 script.py input.txt output.txt
# (don't forget to take names out of input.txt when you're done with them)
import sys
import requests
from bs4 import BeautifulSoup
baseURL = 'https://directory.utexas.edu/index.php'
def getEmailFromName(name):
    """Look up a person in the UT directory and return their email address.

    Sends ``name`` as the ``q`` query parameter to ``baseURL`` and collects
    every ``mailto:`` link found in the returned HTML.

    Args:
        name: The name to search for. Surrounding whitespace (such as the
            trailing newline left on a line read from a file) is stripped
            before the request is made.

    Returns:
        The single email address found, or one of the sentinel strings
        ``"No email found"`` / ``"Found more than one email"`` when the
        result page contains zero or several ``mailto:`` links.

    Raises:
        SystemExit: With exit status 1 if the server answers with a
            non-200 status code (the response body is printed first).
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    payload = {'q': name.strip()}
    # A timeout keeps one stalled connection from hanging the whole batch.
    page = requests.get(baseURL, params=payload, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')
    if page.status_code != 200:
        # Dump the page so the failure can be diagnosed, then stop with a
        # non-zero status so callers/shells can tell the run failed.
        print(soup.prettify())
        print("Yikes, bad status")
        sys.exit(1)

    # The attribute value must be quoted: it contains ':', which is not a
    # valid character in an unquoted CSS identifier.
    emailLinks = [a["href"] for a in soup.select('a[href^="mailto:"]')]
    # Keep what follows the "mailto:" scheme and drop any "?subject=..."
    # style header suffix so only the address itself remains.
    emails = [
        href.partition(':')[2].partition('?')[0] for href in emailLinks
    ]

    if not emails:
        return "No email found"
    if len(emails) > 1:
        return "Found more than one email"
    return emails[0]
# Script driver: read one name per line from the input file (argv[1]) and
# write exactly one lookup result per line to the output file (argv[2]),
# so output line N always corresponds to input line N.
if len(sys.argv) < 3:
    # Fail with a readable message instead of a bare IndexError.
    sys.exit("Usage: python3 script.py input.txt output.txt")

with open(sys.argv[1], 'r') as fp, open(sys.argv[2], 'w') as op:
    for line in fp:
        # Strip the trailing newline (and stray spaces) so it is not sent
        # to the directory as part of the search query.
        op.write(getEmailFromName(line.strip()))
        op.write('\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment