Everfighting · July 18, 2019 03:10
diff --git a/extract_text b/extract_text
 import requests
 from bs4 import BeautifulSoup

 # Download one article and print the text found in its HTML, skipping text
 # that sits directly inside non-content tags.
 url = 'https://www.troyhunt.com/the-773-million-record-collection-1-data-reach/'
 # A timeout keeps the script from blocking forever on an unresponsive server.
 res = requests.get(url, timeout=30)
 # Fail on 4xx/5xx instead of extracting text from an error page.
 res.raise_for_status()
 html_page = res.content
 soup = BeautifulSoup(html_page, 'html.parser')
 # string=True matches every text node; `string` is the current name of the
 # argument that older Beautiful Soup releases called `text`.
 text = soup.find_all(string=True)

 # Text whose direct parent tag has one of these names is left out.
 # A set is used because the collection is only tested for membership.
 blacklist = {
    '[document]',
    'noscript',
    'header',
    'html',
    'meta',
    'head',
    'input',
    'script',
    # there may be more elements you don't want, such as "style", etc.
 }

 # Build the result with a single join instead of growing a string with +=.
 # Each kept text node is still followed by one space, so the printed output
 # is the same as with the loop-based concatenation.
 output = ''.join(
    '{} '.format(t) for t in text if t.parent.name not in blacklist
 )

 print(output)
	import requests
	from bs4 import BeautifulSoup

	# Download one article and print the text found in its HTML, skipping text
	# that sits directly inside non-content tags.
	url = 'https://www.troyhunt.com/the-773-million-record-collection-1-data-reach/'
	# A timeout keeps the script from blocking forever on an unresponsive server.
	res = requests.get(url, timeout=30)
	# Fail on 4xx/5xx instead of extracting text from an error page.
	res.raise_for_status()
	html_page = res.content
	soup = BeautifulSoup(html_page, 'html.parser')
	# string=True matches every text node; `string` is the current name of the
	# argument that older Beautiful Soup releases called `text`.
	text = soup.find_all(string=True)

	# Text whose direct parent tag has one of these names is left out.
	# A set is used because the collection is only tested for membership.
	blacklist = {
		'[document]',
		'noscript',
		'header',
		'html',
		'meta',
		'head',
		'input',
		'script',
		# there may be more elements you don't want, such as "style", etc.
	}

	# The filter and the formatting are nested inside one expression here, so
	# the loop body can no longer lose its indentation. Each kept text node is
	# followed by one space, matching the loop-based concatenation.
	output = ''.join(
		'{} '.format(t) for t in text if t.parent.name not in blacklist
	)

	print(output)