Created
December 20, 2018 15:30
-
-
Save jjasont/ad65d8f004e7f1308a9868902e6536ae to your computer and use it in GitHub Desktop.
Retrieve daily forecast weather data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Import necessary package | |
| import wget | |
| import os | |
| import json | |
| from multiprocessing import Pool | |
| import time | |
# Wall-clock reference taken at start-up; used to report the total run time.
start_time = time.time()

# Storage root, download source and common file-name prefix.
root_store = '/content/gdrive/My Drive/Weather_Forecast_Data_Gathering/'  # Change to local machine directory
root_download_link = 'http://www.weather.gov.sg/files/dailydata/'
file_name_prefix = 'DAILYDATA_'

# Each file extension gets its own subfolder, named after the extension with
# the dot replaced by an underscore and upper-cased ('.csv' -> '_CSV').
file_format = ['.csv', '.pdf']
file_folder = [extension.replace('.', '_').upper() for extension in file_format]
# Mapping of extension -> subfolder name, in the same order as file_format.
file_format_folder = dict(zip(file_format, file_folder))
# Create the storage root and one subfolder per file format.
# os.makedirs(..., exist_ok=True) is used instead of os.mkdir so that missing
# parent directories are created as well, and so that a directory appearing
# between the existence check and the creation call does not raise.
for folder in [root_store] + [os.path.join(root_store, sub_folder) for sub_folder in file_folder]:
    if os.path.exists(folder):
        print("Subfolder {0} exist".format(folder))
    else:
        print("Creating folder {0}".format(folder))
        os.makedirs(folder, exist_ok=True)
# Load weather forecast station information.
# The file is opened with an explicit UTF-8 encoding rather than the platform
# default, so the JSON text is decoded the same way on every machine.
# NOTE(review): the commented-out sequential loop in this file calls
# station.get("station_code") / station.get("station_name") on each item, so
# this is presumably a JSON array of objects with those keys — confirm.
with open('/content/gdrive/My Drive/test/weather_station.json', encoding='utf-8') as f:  # Change to local machine directory and give JSON file
    stations = json.load(f)
# Initialize year and month parameter
# Years as strings, newest first: '2018' down to '1980' (a list, not a generator).
years = [str(year) for year in range(2018, 1979, -1)]
def padMonth(month_number, padding = 2):
    """Return ``month_number`` as a string left-padded with zeros.

    month_number: value to render; it is converted with ``str`` first.
    padding: minimum width of the result (default 2, e.g. 3 -> '03').
    """
    month_text = str(month_number)
    return month_text.zfill(padding)
# Zero-padded month strings, newest first: '12' down to '01' (a list, not a generator).
months = list(map(padMonth, range(12, 0, -1)))
# Every year + month combination ('YYYYMM'), newest first.
year_months = [year + month for year in years for month in months]

# Pair of link files and where to store
# Each entry of h is (download URL, local destination path), ordered by
# file format, then station, then year-month.
# The station code is read from each station record; the earlier version
# referenced an undefined name `station_code` and raised NameError.
# NOTE(review): assumes every station record has a "station_code" key whose
# value is a string — confirm against the JSON file.
h = []
for ext, subfolder in file_format_folder.items():
    for station in stations:
        station_code = station["station_code"]
        for year_month in year_months:
            file_name = file_name_prefix + station_code + '_' + year_month + ext
            h.append((root_download_link + file_name,
                      os.path.join(root_store, subfolder, file_name)))
def retrieveFile(x):
    """Download one file unless the destination already exists.

    x: an indexable pair; x[0] is the download URL and x[1] is the local
    destination path.

    Prints an "EXIST" line and returns when the destination path already
    exists. Otherwise calls ``wget.download`` and prints a success line;
    any exception raised in that step is caught and printed together with
    the URL instead of being propagated. Returns None in every case.
    """
    destination = x[1]
    if os.path.exists(destination):
        print("EXIST: {0!r} has been downloaded before".format(os.path.split(destination)[1]))
        return

    try:
        wget.download(x[0], destination)
        print("DOWNLOAD SUCCESS : {0!r}".format(os.path.split(destination)[1]))
    except Exception as e:
        print("{1!r} - File {0!r} not found".format(x[0], e))
if __name__ == '__main__':
    # Fan the (URL, destination) pairs in h out over 5 worker processes.
    # The pool is used as a context manager so its worker processes are shut
    # down once the blocking map() call has returned; previously the pool was
    # never closed or joined.
    with Pool(5) as p:
        p.map(retrieveFile, h)
    # Elapsed wall-clock time since start_time was recorded. Kept inside the
    # guard so it is printed only when the file is run as a script.
    print("--- %s seconds ---" % (time.time() - start_time))
| # Retrieve CSV and PDF (earlier sequential implementation with progress reporting, left commented out) | |
| # i = 0 | |
| # one_percent_progress = round(0.01*number_of_files) | |
| # for ext, subfolder in file_format_folder.items(): | |
| # for station in stations: | |
| # station_code = station.get("station_code") | |
| # station_name = station.get("station_name") | |
| # for year_month in year_months: | |
| # try: | |
| # file_name = file_name_prefix + station_code + '_' + year_month + ext | |
| # wget.download(root_download_link + file_name, os.path.join(root_store, subfolder, file_name)) | |
| # print("{0!r} has been downloaded".format(file_name)) | |
| # i += 1 | |
| # except: | |
| # print("File {0!r} not found ({1!r}, {2!r})".format(file_name, station_name, year_month)) | |
| # number_of_files -= 1 | |
| # one_percent_progress = round(0.01*number_of_files) | |
| # if (i % one_percent_progress == 0): | |
| # print("Progress {0} % ({1} out of {2})".format(100*i/one_percent_progress,i, number_of_files)) | |
| # print("Retrieval for station {0!r} format {1!r} complete".format(station_name, ext)) | |
| # print("Retrieval for format {0!r} complete".format(ext)) | |
| # print("Retrieval completed".format(ext)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment