typesupply · March 3, 2021 16:38
diff --git a/sentenceExtractor.py b/sentenceExtractor.py
 from urllib.request import build_opener
 import bs4
 from nltk.tokenize import PunktSentenceTokenizer

 # Pull some text from a Wikipedia entry.
 # The text could also come from a string, file or whatever.

 # Fetch the page, sending a browser-style User-agent header with the request.
 url = "https://en.wikipedia.org/wiki/Font,_Switzerland"

 opener = build_opener()
 opener.addheaders = [("User-agent", "Mozilla/5.0")]
 # Using the response as a context manager closes the connection even if
 # read() raises.
 with opener.open(url) as response:
    html = response.read()

 # Parse the HTML. The parser is named explicitly so the result does not
 # depend on which optional parsers happen to be installed.
 try:
    soup = bs4.BeautifulSoup(html, "html.parser")
 except Exception:
    print("[ERROR] Couldn't parse:", url)
    # Nothing below can run without a parsed document, so stop with the
    # real error instead of a later NameError on `soup`.
    raise

 # Gather the text of every element matching one of these tags. Table text
 # is split into separate lines; any other element contributes its text as
 # a single entry.
 tags = ["p", "table", "td", "li", "dfn", "dd"]
 chunks = []
 for tag in tags:
    for element in soup.find_all(tag):
        if tag == "table":
            chunks.extend(element.text.splitlines())
        else:
            chunks.append(element.text)
 text = "\n".join(chunks)

 # Extract sentences from the text.

 sentences = PunktSentenceTokenizer().tokenize(text)
 for sentence in sentences:
    print(sentence)
	from urllib.request import build_opener
	import bs4
	from nltk.tokenize import PunktSentenceTokenizer

	# Pull some text from a Wikipedia entry.
	# The text could also come from a string, file or whatever.

	# Fetch the page, sending a browser-style User-agent header with the request.
	url = "https://en.wikipedia.org/wiki/Font,_Switzerland"

	opener = build_opener()
	opener.addheaders = [("User-agent", "Mozilla/5.0")]
	# Using the response as a context manager closes the connection even if
	# read() raises.
	with opener.open(url) as response:
		html = response.read()

	# Parse the HTML. The parser is named explicitly so the result does not
	# depend on which optional parsers happen to be installed.
	try:
		soup = bs4.BeautifulSoup(html, "html.parser")
	except Exception:
		print("[ERROR] Couldn't parse:", url)
		# Nothing below can run without a parsed document, so stop with the
		# real error instead of a later NameError on `soup`.
		raise

	# Gather the text of every element matching one of these tags. Table text
	# is split into separate lines; any other element contributes its text as
	# a single entry.
	tags = ["p", "table", "td", "li", "dfn", "dd"]
	chunks = []
	for tag in tags:
		for element in soup.find_all(tag):
			if tag == "table":
				chunks.extend(element.text.splitlines())
			else:
				chunks.append(element.text)
	text = "\n".join(chunks)

	# Extract sentences from the text.

	sentences = PunktSentenceTokenizer().tokenize(text)
	for sentence in sentences:
		print(sentence)