gumdropsteve · May 6, 2020 00:01
diff --git a/download_nyc_yellow_taxi.py b/download_nyc_yellow_taxi.py
 import os
 import urllib

 def data_check(file_name, data_dir='', base_url='https://s3.amazonaws.com/nyc-tlc/trip+data/'):
    """
    make sure {file_name} is available locally, downloading it if it is not

    1. check if the given {file_name} exists locally in {data_dir}
    2. download from {base_url} & save in {data_dir} if not

    INPUTS
    > file_name
      >> name of the file that will be downloaded
    > data_dir
      >> directory where you want the data saved (default='' aka current directory),
         with or without a trailing path separator; the directory is not created here
    > base_url
      >> URL prefix that {file_name} is appended to in order to locate the file

    RETURNS
    > None
    """
    # `import urllib` alone does not load the `request` submodule, so pull it in explicitly
    import urllib.request

    # tag cloud & relative local paths to data
    # (os.path.join supplies the separator when data_dir lacks a trailing one)
    local_data = os.path.join(data_dir, file_name)
    cloud_data = base_url + file_name
    # do we already have the file? then there is nothing to do
    if os.path.isfile(local_data):
        return
    # we don't, let me know we're downloading it now, then download it
    print(f'Downloading {cloud_data[8:]} to {local_data}')
    # download under a temporary name & rename once complete, so an interrupted
    # download is never mistaken for a finished file on the next run
    partial_data = local_data + '.part'
    urllib.request.urlretrieve(cloud_data, partial_data)
    os.replace(partial_data, local_data)
                
 # walk the calendar one month at a time, covering every year for that month
 for month_number in range(1, 13):
    # file names use a zero-padded, two digit month (01 - 12)
    month = f'{month_number:02d}'
    # years 2009 - 2016
    for year_number in range(2009, 2017):
        year = str(year_number)
        # nothing past june 2016, so skip july - december of that year
        if year_number >= 2016 and month_number >= 7:
            continue
        # tag file name for this month, then make sure we have its data
        fn = f'yellow_tripdata_{year}-{month}.csv'
        data_check(file_name=fn)
	import os
	import urllib

	def data_check(file_name, data_dir='', base_url='https://s3.amazonaws.com/nyc-tlc/trip+data/'):
	    """
	    make sure {file_name} is available locally, downloading it if it is not

	    1. check if the given {file_name} exists locally in {data_dir}
	    2. download from {base_url} & save in {data_dir} if not

	    INPUTS
	    > file_name
	      >> name of the file that will be downloaded
	    > data_dir
	      >> directory where you want the data saved (default='' aka current directory),
	         with or without a trailing path separator; the directory is not created here
	    > base_url
	      >> URL prefix that {file_name} is appended to in order to locate the file

	    RETURNS
	    > None
	    """
	    # `import urllib` alone does not load the `request` submodule, so pull it in explicitly
	    import urllib.request

	    # tag cloud & relative local paths to data
	    # (os.path.join supplies the separator when data_dir lacks a trailing one)
	    local_data = os.path.join(data_dir, file_name)
	    cloud_data = base_url + file_name
	    # do we already have the file? then there is nothing to do
	    if os.path.isfile(local_data):
	        return
	    # we don't, let me know we're downloading it now, then download it
	    print(f'Downloading {cloud_data[8:]} to {local_data}')
	    # download under a temporary name & rename once complete, so an interrupted
	    # download is never mistaken for a finished file on the next run
	    partial_data = local_data + '.part'
	    urllib.request.urlretrieve(cloud_data, partial_data)
	    os.replace(partial_data, local_data)

	# january - december, covering every year for each month
	for month_number in range(1, 13):
	    # file names use a zero-padded, two digit month (01 - 12)
	    month = f'{month_number:02d}'
	    # go through years 2009 - 2016
	    for year_number in range(2009, 2017):
	        year = str(year_number)
	        # not past june 2016, so skip july - december of that year
	        if year_number >= 2016 and month_number >= 7:
	            continue
	        # tag file name for this month
	        fn = f'yellow_tripdata_{year}-{month}.csv'
	        # check this month's data (downloads it when it is missing locally)
	        data_check(file_name=fn)