fnielsen · May 12, 2021 15:23 · fnielsen · May 12, 2021
diff --git a/url_to_sentence_collector_sentences.py b/url_to_sentence_collector_sentences.py
 #!/usr/bin/env python
 """
 Usage:
  url_to_sentence_collector_sentences <url>

 Description:
  https://commonvoice.mozilla.org/sentence-collector/#/how-to

 """
 from docopt import docopt
 from requests import get
 from lxml.etree import HTML
 from re import sub
 from nltk import sent_tokenize, word_tokenize


 # Parse the command line; the usage text is the module docstring.
 arguments = docopt(__doc__)

 url = arguments["<url>"]

 # Retsinformation.dk now uses Javascript to get the body text, so the
 # document is requested from the site's "api/document/" endpoint instead.
 RETSINFORMATION_URL = 'https://www.retsinformation.dk/'
 RETSINFORMATION_API_URL = RETSINFORMATION_URL + "api/document/"
 is_retsinformation = url.startswith(RETSINFORMATION_URL)
 if is_retsinformation and not url.startswith(RETSINFORMATION_API_URL):
    # Insert the API path right after the site prefix. A URL that already
    # points at the API is left alone so the path is never inserted twice.
    url = RETSINFORMATION_API_URL + url[len(RETSINFORMATION_URL):]

 # Get the webpage
 response = get(url)
 if not response.ok:
    # The HTTP code lives in `status_code` (there is no `status` attribute).
    # SystemExit with a message writes it to stderr and exits with status 1,
    # keeping stdout free for sentences and signalling the failure.
    raise SystemExit(
        "Request for {} failed with HTTP status {}".format(
            url, response.status_code))

 # Get the HTML from the response
 if is_retsinformation:
    # Assumes the API answers with a JSON list whose first item carries the
    # document markup under 'documentHtml' - TODO confirm against the API.
    data = response.json()
    html = data[0]['documentHtml']
 else:
    html = response.content

 # Extract raw text: join all text nodes and collapse whitespace runs.
 tree = HTML(html)
 texts = tree.xpath("//text()")
 text = " ".join(texts)
 cleaned_text = sub(r"\s+", " ", text)

 # Parse sentences. Only sentences with at most 14 tokens (as counted by
 # word_tokenize) and no digits can be used.
 sentences = sent_tokenize(cleaned_text)
 for sentence in sentences:
    words = word_tokenize(sentence)
    if len(words) <= 14 and not any(char.isdigit() for char in sentence):
        print(sentence)
	#!/usr/bin/env python
	"""
	Usage:
	url_to_sentence_collector_sentences <url>

	Description:
	https://commonvoice.mozilla.org/sentence-collector/#/how-to

	"""
	from docopt import docopt
	from requests import get
	from lxml.etree import HTML
	from re import sub
	from nltk import sent_tokenize, word_tokenize


	# Parse the command line; the usage text is the module docstring.
	arguments = docopt(__doc__)

	url = arguments["<url>"]

	# Retsinformation.dk now uses Javascript to get the body text, so the
	# document is requested from the site's "api/document/" endpoint instead.
	RETSINFORMATION_URL = 'https://www.retsinformation.dk/'
	RETSINFORMATION_API_URL = RETSINFORMATION_URL + "api/document/"
	is_retsinformation = url.startswith(RETSINFORMATION_URL)
	if is_retsinformation and not url.startswith(RETSINFORMATION_API_URL):
		# Insert the API path right after the site prefix. A URL that already
		# points at the API is left alone so the path is never inserted twice.
		url = RETSINFORMATION_API_URL + url[len(RETSINFORMATION_URL):]

	# Get the webpage
	response = get(url)
	if not response.ok:
		# The HTTP code lives in `status_code` (there is no `status` attribute).
		# SystemExit with a message writes it to stderr and exits with status 1,
		# keeping stdout free for sentences and signalling the failure.
		raise SystemExit(
			"Request for {} failed with HTTP status {}".format(
				url, response.status_code))

	# Get the HTML from the response
	if is_retsinformation:
		# Assumes the API answers with a JSON list whose first item carries the
		# document markup under 'documentHtml' - TODO confirm against the API.
		data = response.json()
		html = data[0]['documentHtml']
	else:
		html = response.content

	# Extract raw text: join all text nodes and collapse whitespace runs.
	tree = HTML(html)
	texts = tree.xpath("//text()")
	text = " ".join(texts)
	cleaned_text = sub(r"\s+", " ", text)

	# Parse sentences. Only sentences with at most 14 tokens (as counted by
	# word_tokenize) and no digits can be used.
	sentences = sent_tokenize(cleaned_text)
	for sentence in sentences:
		words = word_tokenize(sentence)
		if len(words) <= 14 and not any(char.isdigit() for char in sentence):
			print(sentence)