jackcrane · August 29, 2020 01:40
diff --git a/rec.py b/rec.py
 # Simple, not all that good email scraper that recursively indexes webpages from a domain (replace the variable "site" in the __main__ block at the bottom)

 from bs4 import BeautifulSoup
 import requests

 # Keys of the links already handled; scrape checks this list to avoid repeats.
 urls = []

 # Output file, opened in append mode; scrape writes one line per recorded link.
 f = open("data.txt", "a")

 # function created
 def scrape(site):
 	"""Download ``site`` and record the address from every anchor whose href contains "@".

 	Every anchor's href is printed. For an href containing "@", the string
 	``site + href`` is used as a de-duplication key in the module-level
 	``urls`` list. For a key not seen before, the second ":"-separated field
 	of the href, cut at the first "?", is written as one line to the
 	module-level file ``f``, and ``scrape`` is called again with the key.

 	Args:
 		site: URL of the page to download.

 	Returns:
 		None. Results are written to ``f`` and echoed with ``print``.
 	"""
 	# getting the request from url
 	response = requests.get(site)

 	# converting the text
 	soup = BeautifulSoup(response.text, "html.parser")

 	for anchor in soup.find_all("a"):
 		# <a> tags without an href are skipped instead of raising KeyError.
 		href = anchor.attrs.get("href")
 		if href is None:
 			continue

 		# Function-call form prints the same on Python 2 and Python 3.
 		print(href)
 		if "@" not in href:
 			continue

 		# Build the key without rebinding ``site``: rebinding made each later
 		# key on the page accumulate the earlier hrefs, so repeated addresses
 		# were not recognised as duplicates.
 		key = site + href
 		if key in urls:
 			continue
 		urls.append(key)

 		# "mailto:user@host?subject=..." -> "user@host". An href with "@" but
 		# no ":" is written as-is instead of raising IndexError.
 		fields = href.split(":")
 		address = fields[1] if len(fields) > 1 else fields[0]
 		f.write(address.split("?")[0] + "\n")

 		# calling it self
 		# NOTE(review): this requests the page URL with the href appended,
 		# which is unlikely to be a real page -- confirm this is intended.
 		scrape(key)
 		print(href)

 # main function
 if __name__ == "__main__":
 	import sys

 	# website to be scraped (default start page)
 	site = "https://medill.northwestern.edu/directory/faculty/journalism/index.html"

 	# An optional first command-line argument overrides the default, so the
 	# script can be pointed at another page without editing the source.
 	if len(sys.argv) > 1:
 		site = sys.argv[1]

 	# calling function
 	scrape(site)
	# Simple, not all that good email scraper that recursively indexes webpages from a domain (replace the variable "site" in the __main__ block at the bottom)

	from bs4 import BeautifulSoup
	import requests

	# Keys of the links already handled; scrape checks this list to avoid repeats.
	urls = []

	# Output file, opened in append mode; scrape writes one line per recorded link.
	f = open("data.txt", "a")

	# function created
	def scrape(site):
		"""Download ``site`` and record the address from every anchor whose href contains "@".

		Every anchor's href is printed. For an href containing "@", the string
		``site + href`` is used as a de-duplication key in the module-level
		``urls`` list. For a key not seen before, the second ":"-separated field
		of the href, cut at the first "?", is written as one line to the
		module-level file ``f``, and ``scrape`` is called again with the key.

		Args:
			site: URL of the page to download.

		Returns:
			None. Results are written to ``f`` and echoed with ``print``.
		"""
		# getting the request from url
		response = requests.get(site)

		# converting the text
		soup = BeautifulSoup(response.text, "html.parser")

		for anchor in soup.find_all("a"):
			# <a> tags without an href are skipped instead of raising KeyError.
			href = anchor.attrs.get("href")
			if href is None:
				continue

			# Function-call form prints the same on Python 2 and Python 3.
			print(href)
			if "@" not in href:
				continue

			# Build the key without rebinding ``site``: rebinding made each later
			# key on the page accumulate the earlier hrefs, so repeated addresses
			# were not recognised as duplicates.
			key = site + href
			if key in urls:
				continue
			urls.append(key)

			# "mailto:user@host?subject=..." -> "user@host". An href with "@" but
			# no ":" is written as-is instead of raising IndexError.
			fields = href.split(":")
			address = fields[1] if len(fields) > 1 else fields[0]
			f.write(address.split("?")[0] + "\n")

			# calling it self
			# NOTE(review): this requests the page URL with the href appended,
			# which is unlikely to be a real page -- confirm this is intended.
			scrape(key)
			print(href)

	# main function
	if __name__ == "__main__":
		import sys

		# website to be scraped (default start page)
		site = "https://medill.northwestern.edu/directory/faculty/journalism/index.html"

		# An optional first command-line argument overrides the default, so the
		# script can be pointed at another page without editing the source.
		if len(sys.argv) > 1:
			site = sys.argv[1]

		# calling function
		scrape(site)