Skip to content

Instantly share code, notes, and snippets.

@jjasont
Created December 20, 2018 15:30
Show Gist options
  • Select an option

  • Save jjasont/ad65d8f004e7f1308a9868902e6536ae to your computer and use it in GitHub Desktop.

Select an option

Save jjasont/ad65d8f004e7f1308a9868902e6536ae to your computer and use it in GitHub Desktop.
Retrieve daily forecast weather data
# Import necessary package
import wget
import os
import json
from multiprocessing import Pool
import time
# Remember when the script started so the total run time can be reported.
start_time = time.time()

# Where files are stored locally and where they are downloaded from.
root_store = '/content/gdrive/My Drive/Weather_Forecast_Data_Gathering/'  # Change to local machine directory
root_download_link = 'http://www.weather.gov.sg/files/dailydata/'
file_name_prefix = 'DAILYDATA_'

# Each file extension is stored in its own subfolder, e.g. '.csv' -> '_CSV'.
file_format = ['.csv', '.pdf']
file_folder = [extension.replace('.', '_').upper() for extension in file_format]
file_format_folder = dict(zip(file_format, file_folder))
# Create the storage root and one subfolder per file format.
# os.makedirs(..., exist_ok=True) replaces os.mkdir for two reasons:
#   * missing parent directories are created instead of raising FileNotFoundError;
#   * a folder that appears between the existence check and the creation call
#     (e.g. created by another process running this same script) no longer
#     raises FileExistsError.
# The existence check is kept only to choose which message to print.
if not os.path.exists(os.path.join(root_store)):
    print("Creating folder {0}".format(os.path.join(root_store)))
    os.makedirs(os.path.join(root_store), exist_ok=True)
else:
    print("Subfolder {0} exist".format(os.path.join(root_store)))
for sub_folder in file_folder:
    if not os.path.exists(os.path.join(root_store, sub_folder)):
        print("Creating folder {0}".format(os.path.join(root_store, sub_folder)))
        os.makedirs(os.path.join(root_store, sub_folder), exist_ok=True)
    else:
        print("Subfolder {0} exist".format(os.path.join(root_store, sub_folder)))
# Load weather forecast station information
# The encoding is given explicitly: without it open() decodes with the
# platform's default encoding, which can garble or reject non-ASCII text
# in the JSON file on systems whose default is not UTF-8.
with open('/content/gdrive/My Drive/test/weather_station.json', encoding='utf-8') as f: # Change to local machine directory and give JSON file
    stations = json.load(f)
# Initialize year and month parameter
# Years as a list of strings, newest first: '2018' down to '1980'.
years = [str(year_number) for year_number in range(2018, 1979, -1)]
def padMonth(month_number, padding = 2):
    """Return month_number as a string, left-padded with zeros to `padding` characters.

    Values whose string form is already at least `padding` characters long
    are returned unchanged (as a string).
    """
    month_text = str(month_number)
    return month_text.zfill(padding)
# Zero-padded month strings as a list, December first: '12' down to '01'.
months = [padMonth(month_number) for month_number in range(12, 0, -1)]
# Every year/month combination as 'YYYYMM', in the order of `years`
# and, within each year, in the order of `months`.
year_months = [''.join((year, month)) for year in years for month in months]
# Pair of link files and where to store
# Each entry of `h` is a (download URL, local destination path) tuple for one
# file format / station / year-month combination.
#
# The station code is read from each station record. The previous expression
# referenced a bare name `station_code` that is never assigned anywhere before
# this point, so building the list raised NameError.
# NOTE(review): assumes every station record is a dict with a "station_code"
# key holding a string -- confirm against weather_station.json.
h = [
    (root_download_link + file_name,
     os.path.join(root_store, subfolder, file_name))
    for ext, subfolder in file_format_folder.items()
    for station in stations
    for year_month in year_months
    # Single-item loop: binds the file name once so it is not built twice.
    for file_name in [file_name_prefix + station["station_code"] + '_' + year_month + ext]
]
def retrieveFile(x):
    """Download one file unless it already exists locally.

    x is indexed as a pair: x[0] is the download URL and x[1] is the local
    destination path. The function always returns None; the outcome is
    reported with print().
    """
    url, destination = x[0], x[1]
    file_name = os.path.basename(destination)

    # Skip files that are already present on disk.
    if os.path.exists(destination):
        print("EXIST: {0!r} has been downloaded before".format(file_name))
        return

    try:
        wget.download(url, destination)
        print("DOWNLOAD SUCCESS : {0!r}".format(file_name))
    except Exception as e:
        # Any failure is reported and swallowed so the remaining downloads continue.
        print("{1!r} - File {0!r} not found".format(url, e))
if __name__ == '__main__':
    # Run the downloads with 5 worker processes. The pool is used as a
    # context manager so its workers are always shut down when the block
    # exits, including when a task raises; previously the pool was never
    # closed or terminated.
    with Pool(5) as p:
        p.map(retrieveFile, h)
    # Report the total elapsed time. Kept inside the guard so it is printed
    # only by the process that runs the script.
    print("--- %s seconds ---" % (time.time() - start_time))
# Retrieve CSV and PDF (earlier sequential version, kept commented out for reference)
# i = 0
# one_percent_progress = round(0.01*number_of_files)
# for ext, subfolder in file_format_folder.items():
# for station in stations:
# station_code = station.get("station_code")
# station_name = station.get("station_name")
# for year_month in year_months:
# try:
# file_name = file_name_prefix + station_code + '_' + year_month + ext
# wget.download(root_download_link + file_name, os.path.join(root_store, subfolder, file_name))
# print("{0!r} has been downloaded".format(file_name))
# i += 1
# except:
# print("File {0!r} not found ({1!r}, {2!r})".format(file_name, station_name, year_month))
# number_of_files -= 1
# one_percent_progress = round(0.01*number_of_files)
# if (i % one_percent_progress == 0):
# print("Progress {0} % ({1} out of {2})".format(100*i/one_percent_progress,i, number_of_files))
# print("Retrieval for station {0!r} format {1!r} complete".format(station_name, ext))
# print("Retrieval for format {0!r} complete".format(ext))
# print("Retrieval completed".format(ext))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment