Last active
December 2, 2019 00:04
-
-
Save brettpetch/fd3198fcc248b90c87df5e0e93d6f67c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def data_check(url='https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_multilingual_US_v1_00.tsv.gz',
               file='./data/amazon_reviews_multilingual_US_v1_00.tsv'):
    """
    Make sure the uncompressed data file exists locally, then load it.

    Steps:
        1. If ``file`` already exists, go straight to step 3.
        2. Otherwise:
            a. Create the directory that will hold ``file`` if it is missing.
            b. Download the gzip archive from ``url`` next to ``file``.
            c. Decompress it into a temporary ``.part`` file and rename that
               to ``file`` only once decompression has finished.
            d. Remove the archive (and any leftover ``.part`` file).
        3. Hand ``file`` to ``load_data`` and return its result.

    Download or decompression errors propagate to the caller instead of
    being silently discarded.

    :param url: File download address (a gzip-compressed file).
    :param file: Location of the uncompressed file (relative to the cwd).
    :return: Whatever ``load_data(file)`` returns.
    :raises OSError: If the download, decompression or file handling fails
        (``urllib.error.URLError`` is a subclass of ``OSError``).
    """
    import gzip
    import os
    import shutil
    import urllib.request

    if os.path.isfile(file):
        print("Found data!")
        return load_data(file)

    print("Data not found.")
    # Derive the working directory from `file` so a non-default location works.
    directory = os.path.dirname(file) or "."
    if not os.path.isdir(directory):
        print("Creating directory...")
        os.makedirs(directory, exist_ok=True)

    archive = file + ".gz"
    # Decompress into a side file first: an interrupted run must not leave a
    # truncated `file` behind that a later call would mistake for valid data.
    partial = file + ".part"
    try:
        print("Downloading data... This may take a while (1.5gb)")
        urllib.request.urlretrieve(url, archive)
        print("Unzipping data")
        with gzip.open(archive, 'rb') as f_in, open(partial, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(partial, file)
    finally:
        # Cleanup only -- no `return` here, so a pending exception is not
        # swallowed.
        print("Cleaning up...")
        for leftover in (archive, partial):
            if os.path.exists(leftover):
                os.remove(leftover)
    print("Done.")
    return load_data(file)
# Fetch the dataset if it is not on disk yet and keep the loaded result.
data = data_check()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment