Created
August 29, 2020 01:40
-
-
Save jackcrane/646d4d210bc017e636aad6b80bfe05f5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Simple, not very robust email scraper that recursively indexes webpages from a domain (replace the variable "site" in the __main__ block at the bottom of the file).
from urllib.parse import urldefrag, urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
# Output file: scrape() appends the addresses it finds here, one per line.
# UTF-8 is requested explicitly so the result does not depend on the
# platform's default text encoding.
f = open("data.txt", "a", encoding="utf-8")
# URLs already seen by scrape(); consulted so the same URL is not processed twice.
urls = []
# Crawl one site and record the e-mail addresses it links to.
def scrape(site, max_pages=None):
    """Crawl ``site``'s host and append each linked e-mail address to ``f``.

    Starting at ``site``, every fetched page is parsed for ``<a href=...>``
    tags.  ``mailto:`` links have their address (with any ``?subject=...``
    query removed) written to the module-level file ``f``, one per line, and
    printed.  All other links are resolved against the current page and
    queued when they are http(s) URLs on the same host that are not yet
    listed in the module-level ``urls``.

    Pages that fail to download, or whose Content-Type is not HTML, are
    skipped instead of aborting the crawl.

    Args:
        site: Absolute URL of the page to start from.
        max_pages: Optional cap on the number of pages fetched by this call;
            ``None`` (the default) means no cap.
    """
    host = urlsplit(site).netloc
    visited = set(urls)  # set mirror of the shared list for O(1) membership tests
    if site not in visited:
        visited.add(site)
        urls.append(site)
    recorded = set()  # addresses already written during this call
    pending = [site]  # explicit stack, so large sites cannot hit the recursion limit
    fetched = 0

    while pending:
        if max_pages is not None and fetched >= max_pages:
            break
        page = pending.pop()
        fetched += 1

        try:
            response = requests.get(page, timeout=10)
            response.raise_for_status()
        except requests.RequestException as error:
            print("skipping {}: {}".format(page, error))
            continue

        # Do not feed PDFs, images, etc. to the HTML parser.
        content_type = response.headers.get("Content-Type", "")
        if content_type and "html" not in content_type.lower():
            continue

        soup = BeautifulSoup(response.text, "html.parser")
        # href=True skips anchors that have no href attribute at all.
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"].strip()

            if href.lower().startswith("mailto:"):
                address = href[len("mailto:"):].split("?")[0].strip()
                if address and address not in recorded:
                    recorded.add(address)
                    f.write(address + "\n")
                    print(address)
                continue

            try:
                # Resolve relative links and drop "#fragment" so the same
                # page is not queued once per in-page anchor.
                target = urldefrag(urljoin(page, href)).url
                parts = urlsplit(target)
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal); ignore it.
                continue

            if parts.scheme not in ("http", "https") or parts.netloc != host:
                continue
            if target not in visited:
                visited.add(target)
                urls.append(target)
                pending.append(target)
# Script entry point: crawl the configured start page.
if __name__ == "__main__":
    # Website to be scraped; change this URL to crawl a different site.
    site = "https://medill.northwestern.edu/directory/faculty/journalism/index.html"
    try:
        scrape(site)
    finally:
        # Close the output file even if the crawl fails, so buffered
        # addresses are flushed to disk.
        f.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment