Created April 20, 2022 01:28
-
-
Save audhiaprilliant/dd6f369f223de12c845bb00471ed7d71 to your computer and use it in GitHub Desktop.
How to Automatically Build Stopwords
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download several Project Gutenberg books, cut away the licence header and
# footer of each one, and concatenate the remaining book bodies into `text`.
#
# NOTE(review): relies on `re` and `requests` being imported earlier in the
# file -- the import lines are not visible in this chunk.

# List of URL
urls = [
    'https://www.gutenberg.org/files/1661/1661-0.txt',
    'https://www.gutenberg.org/files/2701/2701-0.txt',
    'https://www.gutenberg.org/files/11/11-0.txt',
    'https://www.gutenberg.org/files/98/98-0.txt',
    'https://www.gutenberg.org/files/74/74-0.txt',
]

# Texts for the start and end of main content
START = 'START OF THE PROJECT GUTENBERG EBOOK'
END = 'END OF THE PROJECT GUTENBERG EBOOK'

# Patterns are loop-invariant, so they are compiled once up front.
# Raw strings keep `\s`, `\S` and `\n` as regex escapes instead of (invalid)
# string escapes. `[\s\S]+` matches any character including newlines.
# Captures everything after the line that carries the START marker.
start_pattern = re.compile(START + r'.+\n([\s\S]+)')
# Greedy capture: everything up to the last END marker.
end_pattern = re.compile(r'([\s\S]+)' + END)

# Seconds to wait for the server before giving up instead of hanging forever
REQUEST_TIMEOUT = 30

# Cleaned book bodies, joined into `text` after the loop
parts = []

# Loop URLs
for url in urls:
    # Make a GET request for the story
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx rather than parsing an error page as book text
    response.raise_for_status()

    # Character set encoding: UTF-8 (undecodable bytes become U+FFFD)
    text_partial = response.content.decode(
        encoding='utf-8',
        errors='replace',
    )

    # Remove the start mark
    start_match = start_pattern.search(text_partial)
    if start_match is None:
        raise ValueError(
            'Start marker {marker!r} not found in {url}'.format(
                marker=START,
                url=url,
            )
        )
    text_partial = start_match.group(1)

    # Remove the end mark
    end_match = end_pattern.search(text_partial)
    if end_match is None:
        raise ValueError(
            'End marker {marker!r} not found in {url}'.format(
                marker=END,
                url=url,
            )
        )
    text_partial = end_match.group(1)

    # Append text
    parts.append(text_partial)

    # Status
    print('Access and preprocess >>> {url}'.format(url=url))

# Text: all book bodies concatenated in URL order
text = ''.join(parts)
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.