Create sqlite database of NOAA GHCN daily weather data
# script to create sqlite database of daily GHCN weather station data
# adds a table of US zipcodes for my own purposes (easily commented out)
# cribs heavily from https://github.com/dylburger/noaa-ghcn-weather-data

import os
import urllib.request

import pandas as pd
import sqlalchemy

# Settings
years = list(range(2010, 2021))  # years of data to download (2010 through 2020)
db_file = 'ghcn-daily-us.db'  # database filename
folder = "ghcn/"  # folder to save .csv.gz files in; set to "" to use the local directory

## ------ script
def get_ghcn_daily_files(years, prefix="", silent=False):
    # years = str, int, or list of years (str or int)
    # prefix = folder name (with trailing slash) to save files in
    if not isinstance(years, list):
        years = [years]
    if prefix:
        os.makedirs(prefix, exist_ok=True)  # create the download folder if it doesn't exist
    url_base = "https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/XXXX.csv.gz"
    for year in years:
        fname = prefix + str(year) + ".csv.gz"
        urllib.request.urlretrieve(url_base.replace("XXXX", str(year)), filename=fname)
        if not silent:
            print(year, "data downloaded")
# create db connection
conn = sqlalchemy.create_engine("sqlite:///" + db_file)

# add stations table
station_metadata = pd.read_csv('https://www1.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt',
                               sep=r'\s+',            # fields are separated by one or more spaces
                               usecols=[0, 1, 2, 3],  # grab only the first 4 columns
                               na_values=[-999.9],    # missing elevation is noted as -999.9
                               header=None,
                               names=['station_id', 'latitude', 'longitude', 'elevation'])
station_metadata.to_sql('stations', conn, if_exists="replace")
# add zipcodes table (Census ZCTA gazetteer; pandas reads the zipped txt directly)
zip_data = pd.read_csv(
    "https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2020_Gazetteer/2020_Gaz_zcta_national.zip",
    sep=r'\s+',
    header=0,
    names=["zip", "area_land", "area_water", "area_land_sqmi",
           "area_water_sqmi", "latitude", "longitude"]
)
zip_data.to_sql("zipcodes", conn, index=False, if_exists="replace")
# download csv files and append to weather table
# by_year files have 8 columns: ID, DATE, ELEMENT, VALUE, M-FLAG, Q-FLAG, S-FLAG, OBS-TIME
for year in years:
    get_ghcn_daily_files(year, folder)
    weather_data = pd.read_csv(folder + str(year) + ".csv.gz",
                               header=None,
                               index_col=False,
                               names=['station_id',
                                      'measurement_date',
                                      'measurement_type',
                                      'measurement_value',
                                      'measurement_flag',
                                      'quality_flag',
                                      'source_flag',
                                      'observation_time'],
                               parse_dates=['measurement_date'])
    # select only US stations
    weather_data = weather_data[weather_data.station_id.str.startswith("US")]
    # keep the five core elements; values are in GHCN units
    # (tenths of deg C for TMAX/TMIN, tenths of mm for PRCP, mm for SNOW/SNWD)
    weather_data_subset = weather_data[weather_data.measurement_type.isin(
        ['PRCP', 'SNOW', 'SNWD', 'TMAX', 'TMIN'])][[
        'station_id', 'measurement_date', 'measurement_type', 'measurement_value']]
    weather_data_subset.to_sql(
        'weather', conn, chunksize=100000, index_label='id', if_exists="append")
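
Once the script finishes, the resulting SQLite file can be queried directly. Below is a minimal sketch, assuming the schema created above; the index name and the example station id are hypothetical placeholders, not part of the original gist. Indexing on (station_id, measurement_type) pays off because the weather table holds tens of millions of rows and most lookups filter on those two columns.

import sqlite3
import pandas as pd

conn = sqlite3.connect("ghcn-daily-us.db")

# an index on (station_id, measurement_type) speeds up per-station lookups
conn.execute("CREATE INDEX IF NOT EXISTS idx_weather_station "
             "ON weather (station_id, measurement_type)")
conn.commit()

# daily max temperatures for one station; GHCN stores TMAX in tenths of deg C
tmax = pd.read_sql("""
    SELECT measurement_date, measurement_value / 10.0 AS tmax_c
    FROM weather
    WHERE station_id = 'USW00023183'  -- hypothetical example station id
      AND measurement_type = 'TMAX'
    ORDER BY measurement_date
""", conn)
print(tmax.head())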
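
Since the gist also loads the zipcodes table, one plausible use (an assumption about intent, not something the original script does) is locating the stations nearest a given ZIP code. A sketch with a vectorized haversine distance; the example ZIP code is a hypothetical placeholder:

import sqlite3
import numpy as np
import pandas as pd

def haversine_km(lat1, lon1, lat2, lon2):
    # great-circle distance between two points, in kilometers
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    return 6371.0 * 2 * np.arcsin(np.sqrt(a))

conn = sqlite3.connect("ghcn-daily-us.db")
zips = pd.read_sql("SELECT zip, latitude, longitude FROM zipcodes", conn)
stations = pd.read_sql("SELECT station_id, latitude, longitude FROM stations", conn)

target = zips[zips["zip"] == 85281].iloc[0]  # hypothetical example ZIP code
stations["dist_km"] = haversine_km(target.latitude, target.longitude,
                                   stations.latitude.values, stations.longitude.values)
print(stations.nsmallest(5, "dist_km"))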