Last active
June 25, 2020 13:10
-
-
Save robotcator/1fb0cdc1437515f5662d33368554f4c8 to your computer and use it in GitHub Desktop.
A script that downloads the text8.zip corpus.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| import six | |
| import os | |
| if six.PY3: | |
| from urllib.request import urlopen | |
| else: | |
| from urllib2 import urlopen | |
| from gensim.models.word2vec import Text8Corpus | |
# Remote location of the text8 corpus archive.
URL = "http://mattmahoney.net/dc/text8.zip"
# File name under which the downloaded archive is stored locally.
ARCHIVE_NAME = "text8.zip"
def download_text8(target_dir=None):
    """Download the text8.zip corpus and store it in ``target_dir``.

    The archive is fetched from http://mattmahoney.net/dc/text8.zip.

    Parameters
    ----------
    target_dir : str, optional
        Directory in which the archive is stored. A leading ``~`` is
        expanded to the user's home folder. If not specified, a folder
        named ``GENSIM_DATA`` in the user's home folder is used. The
        directory is created when it does not exist yet.

    Returns
    -------
    str
        Path of the downloaded archive.
    """
    # Function-scope imports keep this block self-contained; both modules
    # are available on Python 2 and Python 3.
    import shutil
    from contextlib import closing

    if target_dir is None:
        target_dir = os.path.join("~", "GENSIM_DATA")
    target_dir = os.path.expanduser(target_dir)

    # Without this, open() below fails on a machine where the folder
    # has never been created.
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    archive_path = os.path.join(target_dir, ARCHIVE_NAME)
    if os.path.exists(archive_path):
        # An existing archive is assumed to be left over from an incomplete
        # download, so discard it and fetch the file again.
        os.remove(archive_path)

    # closing() guarantees the HTTP response is released even on error;
    # copyfileobj streams in chunks instead of holding the whole archive
    # in memory.
    with closing(urlopen(URL)) as response, open(archive_path, 'wb') as f:
        shutil.copyfileobj(response, f)
    return archive_path
if __name__ == "__main__":
    # Script entry point: download the corpus into the user's home folder.
    file_path = download_text8(target_dir="~")
Yes, you are right. The code is updated. Thanks.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, thanks for the script. Just a little notice, in line 12 you have a typo. It says 'improt' instead of 'import'. Great otherwise 👍