Created
May 12, 2021 15:23
-
-
Save fnielsen/d69ee6b880386466098f9974091ccd80 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""
Usage:
  url_to_sentence_collector_sentences <url>

Description:
  https://commonvoice.mozilla.org/sentence-collector/#/how-to
"""
import sys
from re import sub

from docopt import docopt
from lxml.etree import HTML
from nltk import sent_tokenize, word_tokenize
from requests import get
# Retsinformation.dk now uses Javascript to get the body text, so documents
# from that site are fetched through its JSON API instead of the HTML page.
RETSINFORMATION_PREFIX = 'https://www.retsinformation.dk/'
RETSINFORMATION_API_PATH = 'api/document/'

# Upper bound on the number of tokens in an accepted sentence.
MAX_WORDS = 14

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30


def download_url(url):
    """Return the URL that should actually be downloaded for `url`.

    Retsinformation.dk page URLs are rewritten to the corresponding
    ``api/document/`` URL. URLs that already point at that API, and URLs of
    any other site, are returned unchanged.
    """
    if not url.startswith(RETSINFORMATION_PREFIX):
        return url
    path = url[len(RETSINFORMATION_PREFIX):]
    if path.startswith(RETSINFORMATION_API_PATH):
        # Already an API URL; inserting the segment again would break it.
        return url
    return RETSINFORMATION_PREFIX + RETSINFORMATION_API_PATH + path


def extract_text(html):
    """Return all text nodes of `html` joined into one whitespace-normalized
    string.

    `html` is whatever was downloaded: the raw response body, or the
    ``documentHtml`` value taken from the Retsinformation JSON answer.
    """
    tree = HTML(html)
    if tree is None:
        # Defensive: nothing to extract if the parser produced no root element.
        return ""
    text = " ".join(tree.xpath("//text()"))
    return sub(r"\s+", " ", text)


def is_usable_sentence(sentence, words):
    """Return True if `sentence` passes the length and digit filter.

    A sentence is kept only when `words` (its token list) has at most
    MAX_WORDS entries and `sentence` contains no digit character.
    """
    return len(words) <= MAX_WORDS and not any(
        char.isdigit() for char in sentence)


def usable_sentences(text):
    """Yield the sentences of `text` that pass `is_usable_sentence`."""
    for sentence in sent_tokenize(text):
        if is_usable_sentence(sentence, word_tokenize(sentence)):
            yield sentence


def main():
    """Download the page named on the command line and print usable sentences.

    Returns the process exit status: 0 on success, 1 if the download failed
    (in which case the HTTP status code is written to stderr so that stdout
    only ever carries sentences).
    """
    arguments = docopt(__doc__)
    url = download_url(arguments["<url>"])

    # Get the webpage
    response = get(url, timeout=REQUEST_TIMEOUT)
    if not response.ok:
        print(response.status_code, file=sys.stderr)
        return 1

    # Get the HTML from the response
    if url.startswith(RETSINFORMATION_PREFIX):
        data = response.json()
        html = data[0]['documentHtml']
    else:
        html = response.content

    # Only sentences with at most MAX_WORDS tokens and no digits can be used.
    for sentence in usable_sentences(extract_text(html)):
        print(sentence)
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Abbreviations and symbol-containing sentences are not excluded, e.g., "EU" and "/".