Created
October 1, 2019 02:50
-
-
Save Jiali-Qi/e6ffe256a60b382da80cc6db49dda856 to your computer and use it in GitHub Desktop.
Assignment 3
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 23 11:06:35 2019
@author: qijia
"""
import pathlib
import queue
import re
from urllib.parse import urljoin
from urllib.parse import urlparse

import requests
import selenium
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
def is_absolute(url):
    """Determine whether URL is absolute.

    A URL is treated as absolute when it carries a network location
    (host), e.g. ``https://host/path`` or the scheme-relative
    ``//host/path``.  Scheme-only references such as ``mailto:`` have no
    network location and are therefore reported as not absolute.

    Args:
        url: The URL string to inspect.  ``None`` and the empty string are
            accepted and reported as not absolute.

    Returns:
        True if ``url`` contains a network location, False otherwise.
    """
    # Anchors without an href yield None; answer explicitly instead of
    # relying on how urlparse happens to coerce non-string input.
    if not url:
        return False
    return bool(urlparse(url).netloc)
# --- Crawler configuration --------------------------------------------------
START_URL = "https://www.stevens.edu/"
MAX_PAGES = 10000  # upper bound on the number of pages fetched
CHROMEDRIVER_PATH = 'C:/Users/qijia/Downloads/chromedriver_win32/chromedriver.exe'
OUTPUT_FILE = "email.txt"

# NOTE(review): the regex in the published copy of this script was destroyed
# by the hosting page's e-mail obfuscation (it read "\[email protected]"),
# which can only ever match that literal text.  A general e-mail pattern is
# used here; confirm whether the original was restricted to one domain.
EMAIL_PATTERN = re.compile(r"[\w.+-]+@[\w-]+(?:\.[\w-]+)+")

# --- Headless Chrome setup --------------------------------------------------
# NOTE(review): `executable_path` / `chrome_options` are the keyword names of
# the Selenium release this script was written against; newer releases use
# `service=` / `options=` -- confirm against the installed version.
options = webdriver.ChromeOptions()
options.add_argument("headless")
driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, chrome_options=options)

# --- Breadth-first crawl ----------------------------------------------------
email_addresses = set()
q = queue.Queue()
q.put(START_URL)
# Every URL that has ever been queued, so no page is fetched twice.
seen = {START_URL}

try:
    for _ in range(MAX_PAGES):
        # Queue.get() would block forever on an empty queue; stop instead.
        if q.empty():
            break
        url = q.get()

        try:
            driver.get(url)
        except WebDriverException as exc:
            # One unreachable page must not abort the whole crawl.
            print("Skipping {}: {}".format(url, exc))
            continue

        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Extract all email addresses from the rendered page text.
        email_addresses.update(EMAIL_PATTERN.findall(soup.get_text()))

        # href=True skips anchors that have no href attribute at all.
        for link in soup.find_all('a', href=True):
            u = link['href']
            if not is_absolute(u):
                u = urljoin(url, u)
            # Only web pages can be loaded by the browser; drop mailto:,
            # javascript:, tel: and similar links.
            if urlparse(u).scheme not in ("http", "https"):
                continue
            if u not in seen:
                seen.add(u)
                q.put(u)

        print("Queue size: {}".format(q.qsize()))
        print("# email addresses: {}".format(len(email_addresses)))
finally:
    # Always shut the browser down, even if the crawl is interrupted.
    driver.quit()

# --- Save results -----------------------------------------------------------
# Sorted so that repeated runs produce a stable, diff-friendly file.
email_addresses = sorted(email_addresses)
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    f.writelines(e + "\n" for e in email_addresses)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment