Created
May 6, 2020 00:01
-
-
Save gumdropsteve/bf302008387150b76e4226dba314440a to your computer and use it in GitHub Desktop.
January 2009 thru June 2016 - Download TLC Yellow Cab dataset from AWS.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import urllib | |
def data_check(file_name, data_dir='', base_url='https://s3.amazonaws.com/nyc-tlc/trip+data/'):
    """
    Make sure {file_name} is available locally, downloading it if it is not.

    1. check if the given {file_name} exists locally in {data_dir}
    2. download from {base_url} & save in {data_dir} if not

    INPUTS
    > file_name
        >> name of the file that will be downloaded
    > data_dir
        >> directory where you want the data saved (default='' aka current directory);
           a trailing path separator is optional
    > base_url
        >> most of the URL where each month's data is located; {file_name} is
           appended to it unchanged

    RETURNS
    > None (the function only prints a message and writes the file)
    """
    # `import urllib` on its own does not load the `request` submodule, so
    # import it explicitly here to guarantee `urllib.request` resolves.
    import urllib.request

    # tag cloud & relative local paths to data; os.path.join inserts the
    # separator when data_dir lacks one and leaves '' / 'dir/' unchanged
    local_data = os.path.join(data_dir, file_name)
    cloud_data = base_url + file_name

    # do we already have the file?
    if not os.path.isfile(local_data):
        # we don't: announce the download (the [8:] slice drops the leading
        # 'https://' from the displayed URL), then fetch it
        print(f'Downloading {cloud_data[8:]} to {local_data}')
        urllib.request.urlretrieve(cloud_data, local_data)
# Request every monthly file from January 2009 through June 2016.
# Iteration is month-major (all Januaries, then all Februaries, ...).
for month_no in range(1, 13):
    # two-digit, zero-padded month as used in the file names
    month = f'{month_no:02d}'
    for year_no in range(2009, 2017):
        # the range stops at June 2016, so skip July-December of that year
        if year_no >= 2016 and month_no >= 7:
            continue
        year = str(year_no)
        # file name for this year/month
        fn = f'yellow_tripdata_{year}-{month}.csv'
        # download it unless it is already on disk
        data_check(file_name=fn)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment