bobquest33 · June 4, 2017 06:55
diff --git a/extract_email_currentjobs.py b/extract_email_currentjobs.py
 import html2text
 import requests
 from bs4 import BeautifulSoup

 # Download the job listing page.  The timeout stops the script from hanging
 # forever on an unresponsive server, and raise_for_status() aborts on an HTTP
 # error instead of scraping the error page.
 r = requests.get("https://bigdatacv.com/currentjobs/", timeout=30)
 r.raise_for_status()
 content = r.text
 soup = BeautifulSoup(content, 'html.parser')
 print(soup.prettify())

 # Remove elements that should not contribute to the extracted text.
 # NOTE(review): dropping <a> elements also drops any address that appears
 # only as link text (e.g. a mailto link) -- confirm this is intended.
 for tag_name in ('script', 'style', 'img', 'a'):
     for tag in soup(tag_name):
         tag.extract()
 print(soup.text)

 # Convert the cleaned markup to plain text; `text` is what gets searched
 # for email addresses.
 text = html2text.html2text(soup.prettify())
 print(text)

 import re

 # Code from https://github.com/fredericpierron/extract-email-from-text-python-3/
 # Group 1 is the whole address, group 2 the "@" / " at " separator and group 3
 # the last "." / " dot " separator matched inside the domain.  Raw strings keep
 # the regex escapes (\/, \., \s) out of Python's own string-escape handling,
 # and IGNORECASE lets the lowercase-only character classes accept mixed-case
 # text such as "John.Doe@Example.com".
 regex = re.compile(
     r"([a-z0-9!#$%&*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`"
     r"{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|"
     r"\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)",
     re.IGNORECASE,
 )


 def get_emails(s):
     """Return an iterator of the email addresses found in string s.

     Both plain ("user@example.com") and spelled-out
     ("user at example dot com") forms are recognised, in any letter case.
     Addresses are yielded exactly as they appear in s, in order of
     appearance; duplicates are not removed.
     """
     # Skip matches that start with '//': the local-part character class
     # accepts '/', so a URL such as 'http://user@example.com' would otherwise
     # be reported as '//user@example.com'.
     return (
         match.group(1)
         for match in regex.finditer(s)
         if not match.group(1).startswith('//')
     )

 # Write every address found in the page text to stdout, one per line.
 for address in get_emails(text):
     print(address)
	import html2text
	import requests
	from bs4 import BeautifulSoup

	# Download the job listing page.  The timeout stops the script from hanging
	# forever on an unresponsive server, and raise_for_status() aborts on an HTTP
	# error instead of scraping the error page.
	r = requests.get("https://bigdatacv.com/currentjobs/", timeout=30)
	r.raise_for_status()
	content = r.text
	soup = BeautifulSoup(content, 'html.parser')
	print(soup.prettify())

	# Remove elements that should not contribute to the extracted text.
	# NOTE(review): dropping <a> elements also drops any address that appears
	# only as link text (e.g. a mailto link) -- confirm this is intended.
	for tag_name in ('script', 'style', 'img', 'a'):
		for tag in soup(tag_name):
			tag.extract()
	print(soup.text)

	# Convert the cleaned markup to plain text; `text` is what gets searched
	# for email addresses.
	text = html2text.html2text(soup.prettify())
	print(text)

	import re

	# Code from https://github.com/fredericpierron/extract-email-from-text-python-3/
	# Group 1 is the whole address, group 2 the "@" / " at " separator and group 3
	# the last "." / " dot " separator matched inside the domain.  The pattern is
	# written with its '*' quantifiers and unescaped '|' alternations: without
	# them the separator groups only match literal pipe characters and no
	# ordinary address can be found.  Raw strings keep the regex escapes
	# (\/, \., \s) out of Python's own string-escape handling, and IGNORECASE
	# lets the lowercase-only character classes accept mixed-case text.
	regex = re.compile(
		r"([a-z0-9!#$%&*+\/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+\/=?^_`"
		r"{|}~-]+)*(@|\sat\s)(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?(\.|"
		r"\sdot\s))+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)",
		re.IGNORECASE,
	)


	def get_emails(s):
		"""Return an iterator of the email addresses found in string s.

		Both plain ("user@example.com") and spelled-out
		("user at example dot com") forms are recognised, in any letter case.
		Addresses are yielded exactly as they appear in s, in order of
		appearance; duplicates are not removed.
		"""
		# Skip matches that start with '//': the local-part character class
		# accepts '/', so a URL such as 'http://user@example.com' would
		# otherwise be reported as '//user@example.com'.
		return (
			match.group(1)
			for match in regex.finditer(s)
			if not match.group(1).startswith('//')
		)

	# Write every address found in the page text to stdout, one per line.
	# The print call must be indented to form the loop body.
	for email in get_emails(text):
		print(email)
No results found