GHCN parse .dly files, export as CSV, build dataframe for running statistics
# GHCN data
using CSV
using DataFrames
using Dates
using Statistics
using Plots
# Data
# USCHN
# https://cdiac.ess-dive.lbl.gov/epubs/ndp/ushcn/ushcn.html#:~:text=The%20United%20States%20Historical%20Climatology%20Network%20%28USHCN%29%20is,observing%20stations%20across%20the%2048%20contiguous%20United%20States.
# https://docs.google.com/spreadsheets/d/1mWanx8ojmOkcazzRhDaoHMHyDWxR3PqUP0EdtdIUU84/edit#gid=719135559
# Get start / end dates
ghcn_inventory = CSV.read("C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/ghcnd-inventory.txt", DataFrame, header=false, delim=' ', ignorerepeated=true)
start_years = ghcn_inventory[:,5]
min_date = minimum(start_years)
end_years = ghcn_inventory[:,6]
minimum(end_years)
# set index to date stamp yyyymmdd
dr = Date(min_date,1,01):Day(1):Date(today())
date_index = collect(dr)
date_index_tmax = DataFrame(Index = collect(1:1:size(date_index,1)), Date = date_index)
date_index_tmin = DataFrame(Index = collect(1:1:size(date_index,1)), Date = date_index)
######################################################################################################
# Loop through the .dly files, left joining to the master date index - extract TMIN / TMAX data
# load the .dly data file
# load as one string per row
# split each string per the readme .txt element / column positions (fixed width)
######################################################################################################
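# Per the GHCN-Daily readme, each .dly record is fixed width: ID (chars 1-11), YEAR (12-15),
# MONTH (16-17), ELEMENT (18-21), then 31 blocks of 8 characters (VALUE 5, MFLAG 1, QFLAG 1,
# SFLAG 1), one block per day of the month. A minimal sketch of the offset arithmetic used by
# the per-day parsing further below (illustrative only):
#=
for day = 1:31
    o = 21 + 8*(day - 1)            # 0-based offset of the day's 8-character block
    value_range = (o+1):(o+5)       # e.g. 22:26 for day 1, 30:34 for day 2
    mflag_pos, qflag_pos, sflag_pos = o+6, o+7, o+8
end
=#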
# read all files in directory
all_dly_files = cd(readdir, "C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/ghcnd_all/")
all_dly_files_for_export = chop.(all_dly_files, tail = 4)
# read ghcnd-stations.txt
# subset US stations
ghcn_stations = CSV.read("C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/ghcnd-stations.txt", DataFrame, header=false, delim=' ', ignorerepeated=true)
station_ID = ghcn_stations[:,1]
station_ID_chop = fill("",size(ghcn_stations,1))
US_stations = fill("",size(ghcn_stations,1))
for i = 1:size(ghcn_stations,1)
station_ID_chop[i] = station_ID[i][1:2] # subset first two of the ID
if station_ID_chop[i] == "US"
US_stations[i] = join([station_ID[i], ".dly"])
end
end
US_stations = DataFrame(hcat(US_stations[US_stations .!= ""]))
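# A more compact alternative to the station loop above (a sketch, assuming the country code
# is always the first two characters of the station ID, as the loop above does):
#= US_stations = DataFrame(x1 = [id * ".dly" for id in station_ID if startswith(id, "US")]) =#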
# Build the list of US station .dly files
# read all files in directory
all_dly_files = DataFrame(hcat(cd(readdir, "C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/ghcnd_all/")))
us_dly = innerjoin(US_stations, all_dly_files, on = :x1)
us_dly = us_dly[:,1]
us_dly_files_for_export = string.(chop.(us_dly, tail = 4))
# initialize output
all_out_tmax = DataFrame(Date = Date[], TMAX = Float64[])
all_out_tmin = DataFrame(Date = Date[], TMIN = Float64[])
all_out_tavg = DataFrame(Date = Date[], TAVG = Float64[])
# start loop
d=1
for d = 1:length(us_dly)
print("Starting dly file ", us_dly[d]," - Iteration ",d,"\n")
data = CSV.read(join(["C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/ghcnd_all/", us_dly[d]]), DataFrame, header=false,delim="", ignorerepeated=false)
#data = CSV.read("C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/ghcnd_all/AGM00060515.dly", DataFrame, header=false,delim="", ignorerepeated=false)
ID = fill("",size(data,1))
YEAR = fill("",size(data,1))
MONTH = fill("",size(data,1))
ELEMENT = fill("",size(data,1))
VALUE1 = fill(0.0,size(data,1))
MFLAG1 = fill("",size(data,1))
QFLAG1 = fill("",size(data,1))
SFLAG1 = fill("",size(data,1))
VALUE2 = fill(0.0,size(data,1))
MFLAG2 = fill("",size(data,1))
QFLAG2 = fill("",size(data,1))
SFLAG2 = fill("",size(data,1))
VALUE3 = fill(0.0,size(data,1))
MFLAG3 = fill("",size(data,1))
QFLAG3 = fill("",size(data,1))
SFLAG3 = fill("",size(data,1))
VALUE4 = fill(0.0,size(data,1))
MFLAG4 = fill("",size(data,1))
QFLAG4 = fill("",size(data,1))
SFLAG4 = fill("",size(data,1))
VALUE5 = fill(0.0,size(data,1))
MFLAG5 = fill("",size(data,1))
QFLAG5 = fill("",size(data,1))
SFLAG5 = fill("",size(data,1))
VALUE6 = fill(0.0,size(data,1))
MFLAG6 = fill("",size(data,1))
QFLAG6 = fill("",size(data,1))
SFLAG6 = fill("",size(data,1))
VALUE7 = fill(0.0,size(data,1))
MFLAG7 = fill("",size(data,1))
QFLAG7 = fill("",size(data,1))
SFLAG7 = fill("",size(data,1))
VALUE8 = fill(0.0,size(data,1))
MFLAG8 = fill("",size(data,1))
QFLAG8 = fill("",size(data,1))
SFLAG8 = fill("",size(data,1))
VALUE9 = fill(0.0,size(data,1))
MFLAG9 = fill("",size(data,1))
QFLAG9 = fill("",size(data,1))
SFLAG9 = fill("",size(data,1))
VALUE10 = fill(0.0,size(data,1))
MFLAG10 = fill("",size(data,1))
QFLAG10 = fill("",size(data,1))
SFLAG10 = fill("",size(data,1))
VALUE11 = fill(0.0,size(data,1))
MFLAG11 = fill("",size(data,1))
QFLAG11 = fill("",size(data,1))
SFLAG11 = fill("",size(data,1))
VALUE12 = fill(0.0,size(data,1))
MFLAG12 = fill("",size(data,1))
QFLAG12 = fill("",size(data,1))
SFLAG12 = fill("",size(data,1))
VALUE13 = fill(0.0,size(data,1))
MFLAG13 = fill("",size(data,1))
QFLAG13 = fill("",size(data,1))
SFLAG13 = fill("",size(data,1))
VALUE14 = fill(0.0,size(data,1))
MFLAG14 = fill("",size(data,1))
QFLAG14 = fill("",size(data,1))
SFLAG14 = fill("",size(data,1))
VALUE15 = fill(0.0,size(data,1))
MFLAG15 = fill("",size(data,1))
QFLAG15 = fill("",size(data,1))
SFLAG15 = fill("",size(data,1))
VALUE16 = fill(0.0,size(data,1))
MFLAG16 = fill("",size(data,1))
QFLAG16 = fill("",size(data,1))
SFLAG16 = fill("",size(data,1))
VALUE17 = fill(0.0,size(data,1))
MFLAG17 = fill("",size(data,1))
QFLAG17 = fill("",size(data,1))
SFLAG17 = fill("",size(data,1))
VALUE18 = fill(0.0,size(data,1))
MFLAG18 = fill("",size(data,1))
QFLAG18 = fill("",size(data,1))
SFLAG18 = fill("",size(data,1))
VALUE19 = fill(0.0,size(data,1))
MFLAG19 = fill("",size(data,1))
QFLAG19 = fill("",size(data,1))
SFLAG19 = fill("",size(data,1))
VALUE20 = fill(0.0,size(data,1))
MFLAG20 = fill("",size(data,1))
QFLAG20 = fill("",size(data,1))
SFLAG20 = fill("",size(data,1))
VALUE21 = fill(0.0,size(data,1))
MFLAG21 = fill("",size(data,1))
QFLAG21 = fill("",size(data,1))
SFLAG21 = fill("",size(data,1))
VALUE22 = fill(0.0,size(data,1))
MFLAG22 = fill("",size(data,1))
QFLAG22 = fill("",size(data,1))
SFLAG22 = fill("",size(data,1))
VALUE23 = fill(0.0,size(data,1))
MFLAG23 = fill("",size(data,1))
QFLAG23 = fill("",size(data,1))
SFLAG23 = fill("",size(data,1))
VALUE24 = fill(0.0,size(data,1))
MFLAG24 = fill("",size(data,1))
QFLAG24 = fill("",size(data,1))
SFLAG24 = fill("",size(data,1))
VALUE25 = fill(0.0,size(data,1))
MFLAG25 = fill("",size(data,1))
QFLAG25 = fill("",size(data,1))
SFLAG25 = fill("",size(data,1))
VALUE26 = fill(0.0,size(data,1))
MFLAG26 = fill("",size(data,1))
QFLAG26 = fill("",size(data,1))
SFLAG26 = fill("",size(data,1))
VALUE27 = fill(0.0,size(data,1))
MFLAG27 = fill("",size(data,1))
QFLAG27 = fill("",size(data,1))
SFLAG27 = fill("",size(data,1))
VALUE28 = fill(0.0,size(data,1))
MFLAG28 = fill("",size(data,1))
QFLAG28 = fill("",size(data,1))
SFLAG28 = fill("",size(data,1))
VALUE29 = fill(0.0,size(data,1))
MFLAG29 = fill("",size(data,1))
QFLAG29 = fill("",size(data,1))
SFLAG29 = fill("",size(data,1))
VALUE30 = fill(0.0,size(data,1))
MFLAG30 = fill("",size(data,1))
QFLAG30 = fill("",size(data,1))
SFLAG30 = fill("",size(data,1))
VALUE31 = fill(0.0,size(data,1))
MFLAG31 = fill("",size(data,1))
QFLAG31 = fill("",size(data,1))
SFLAG31 = fill("",size(data,1))
i =1
for i = 1:size(data,1)
temp = data[i,1]
ID[i] = temp[1:11]
YEAR[i] = temp[12:15]
MONTH[i] = temp[16:17]
ELEMENT[i] = temp[18:21]
VALUE1[i] = parse.(Float64, temp[22:26]) / 10 # day 1 of the month
MFLAG1[i] = string(temp[27])
QFLAG1[i] = string(temp[28])
SFLAG1[i] = string(temp[29])
VALUE2[i] = parse.(Float64, temp[30:34]) / 10 # day 2 of the month
MFLAG2[i] = string(temp[35])
QFLAG2[i] = string(temp[36])
SFLAG2[i] = string(temp[37])
VALUE3[i] = parse.(Float64, temp[38:42]) / 10 # day 3 of the month
MFLAG3[i] = string(temp[43])
QFLAG3[i] = string(temp[44])
SFLAG3[i] = string(temp[45])
VALUE4[i] = parse.(Float64, temp[46:50]) / 10 # day 4 of the month
MFLAG4[i] = string(temp[51])
QFLAG4[i] = string(temp[52])
SFLAG4[i] = string(temp[53])
VALUE5[i] = parse.(Float64, temp[54:58]) / 10 # day 5 of the month
MFLAG5[i] = string(temp[59])
QFLAG5[i] = string(temp[60])
SFLAG5[i] = string(temp[61])
VALUE6[i] = parse.(Float64, temp[62:66]) / 10 # day 6 of the month
MFLAG6[i] = string(temp[67])
QFLAG6[i] = string(temp[68])
SFLAG6[i] = string(temp[69])
VALUE7[i] = parse.(Float64, temp[70:74]) / 10 # day 7 of the month
MFLAG7[i] = string(temp[75])
QFLAG7[i] = string(temp[76])
SFLAG7[i] = string(temp[77])
VALUE8[i] = parse.(Float64, temp[78:82]) / 10 # day 8 of the month
MFLAG8[i] = string(temp[83])
QFLAG8[i] = string(temp[84])
SFLAG8[i] = string(temp[85])
VALUE9[i] = parse.(Float64, temp[86:90]) / 10 # day 9 of the month
MFLAG9[i] = string(temp[91])
QFLAG9[i] = string(temp[92])
SFLAG9[i] = string(temp[93])
VALUE10[i] = parse.(Float64, temp[94:98]) / 10 # day 10 of the month
MFLAG10[i] = string(temp[99])
QFLAG10[i] = string(temp[100])
SFLAG10[i] = string(temp[101])
VALUE11[i] = parse.(Float64, temp[102:106]) / 10 # day 11 of the month
MFLAG11[i] = string(temp[107])
QFLAG11[i] = string(temp[108])
SFLAG11[i] = string(temp[109])
VALUE12[i] = parse.(Float64, temp[110:114]) / 10 # day 12 of the month
MFLAG12[i] = string(temp[115])
QFLAG12[i] = string(temp[116])
SFLAG12[i] = string(temp[117])
VALUE13[i] = parse.(Float64, temp[118:122]) / 10 # day 13 of the month
MFLAG13[i] = string(temp[123])
QFLAG13[i] = string(temp[124])
SFLAG13[i] = string(temp[125])
VALUE14[i] = parse.(Float64, temp[126:130]) / 10 # day 14 of the month
MFLAG14[i] = string(temp[131])
QFLAG14[i] = string(temp[132])
SFLAG14[i] = string(temp[133])
VALUE15[i] = parse.(Float64, temp[134:138]) / 10 # day 15 of the month
MFLAG15[i] = string(temp[139])
QFLAG15[i] = string(temp[140])
SFLAG15[i] = string(temp[141])
VALUE16[i] = parse.(Float64, temp[142:146]) / 10 # day 16 of the month
MFLAG16[i] = string(temp[147])
QFLAG16[i] = string(temp[148])
SFLAG16[i] = string(temp[149])
VALUE17[i] = parse.(Float64, temp[150:154]) / 10 # day 17 of the month
MFLAG17[i] = string(temp[155])
QFLAG17[i] = string(temp[156])
SFLAG17[i] = string(temp[157])
VALUE18[i] = parse.(Float64, temp[158:162]) / 10 # day 18 of the month
MFLAG18[i] = string(temp[163])
QFLAG18[i] = string(temp[164])
SFLAG18[i] = string(temp[165])
VALUE19[i] = parse.(Float64, temp[166:170]) / 10 # day 19 of the month
MFLAG19[i] = string(temp[171])
QFLAG19[i] = string(temp[172])
SFLAG19[i] = string(temp[173])
VALUE20[i] = parse.(Float64, temp[174:178]) / 10 # day 20 of the month
MFLAG20[i] = string(temp[179])
QFLAG20[i] = string(temp[180])
SFLAG20[i] = string(temp[181])
VALUE21[i] = parse.(Float64, temp[182:186]) / 10 # day 21 of the month
MFLAG21[i] = string(temp[187])
QFLAG21[i] = string(temp[188])
SFLAG21[i] = string(temp[189])
VALUE22[i] = parse.(Float64, temp[190:194]) / 10 # day 22 of the month
MFLAG22[i] = string(temp[195])
QFLAG22[i] = string(temp[196])
SFLAG22[i] = string(temp[197])
VALUE23[i] = parse.(Float64, temp[198:202]) / 10 # day 23 of the month
MFLAG23[i] = string(temp[203])
QFLAG23[i] = string(temp[204])
SFLAG23[i] = string(temp[205])
VALUE24[i] = parse.(Float64, temp[206:210]) / 10 # day 24 of the month
MFLAG24[i] = string(temp[211])
QFLAG24[i] = string(temp[212])
SFLAG24[i] = string(temp[213])
VALUE25[i] = parse.(Float64, temp[214:218]) / 10 # day 25 of the month
MFLAG25[i] = string(temp[219])
QFLAG25[i] = string(temp[220])
SFLAG25[i] = string(temp[221])
VALUE26[i] = parse.(Float64, temp[222:226]) / 10 # day 26 of the month
MFLAG26[i] = string(temp[227])
QFLAG26[i] = string(temp[228])
SFLAG26[i] = string(temp[229])
VALUE27[i] = parse.(Float64, temp[230:234]) / 10 # day 27 of the month
MFLAG27[i] = string(temp[235])
QFLAG27[i] = string(temp[236])
SFLAG27[i] = string(temp[237])
VALUE28[i] = parse.(Float64, temp[238:242]) / 10 # day 28 of the month
MFLAG28[i] = string(temp[243])
QFLAG28[i] = string(temp[244])
SFLAG28[i] = string(temp[245])
VALUE29[i] = parse.(Float64, temp[246:250]) / 10 # day 29 of the month
MFLAG29[i] = string(temp[251])
QFLAG29[i] = string(temp[252])
SFLAG29[i] = string(temp[253])
VALUE30[i] = parse.(Float64, temp[254:258]) / 10 # day 30 of the month
MFLAG30[i] = string(temp[259])
QFLAG30[i] = string(temp[260])
SFLAG30[i] = string(temp[261])
VALUE31[i] = parse.(Float64, temp[262:266]) / 10 # day 31 of the month
MFLAG31[i] = string(temp[267])
QFLAG31[i] = string(temp[268])
SFLAG31[i] = string(temp[269])
end
# column names
colnames = ["ID","YEAR","MONTH","ELEMENT","VALUE1","MFLAG1","QFLAG1","SFLAG1","VALUE2","MFLAG2","QFLAG2","SFLAG2","VALUE3","MFLAG3","QFLAG3","SFLAG3","VALUE4","MFLAG4","QFLAG4","SFLAG4","VALUE5",
"MFLAG5","QFLAG5","SFLAG5","VALUE6","MFLAG6","QFLAG6","SFLAG6","VALUE7","MFLAG7","QFLAG7","SFLAG7","VALUE8","MFLAG8","QFLAG8","SFLAG8","VALUE9","MFLAG9","QFLAG9","SFLAG9","VALUE10","MFLAG10",
"QFLAG10","SFLAG10","VALUE11","MFLAG11","QFLAG11","SFLAG11","VALUE12","MFLAG12","QFLAG12","SFLAG12","VALUE13","MFLAG13","QFLAG13","SFLAG13","VALUE14","MFLAG14","QFLAG14","SFLAG14",
"VALUE15","MFLAG15","QFLAG15","SFLAG15","VALUE16","MFLAG16","QFLAG16","SFLAG16","VALUE17","MFLAG17","QFLAG17","SFLAG17","VALUE18","MFLAG18","QFLAG18","SFLAG18","VALUE19","MFLAG19",
"QFLAG19","SFLAG19","VALUE20","MFLAG20","QFLAG20","SFLAG20","VALUE21","MFLAG21","QFLAG21","SFLAG21","VALUE22","MFLAG22","QFLAG22","SFLAG22","VALUE23","MFLAG23","QFLAG23","SFLAG23",
"VALUE24","MFLAG24","QFLAG24","SFLAG24","VALUE25","MFLAG25","QFLAG25","SFLAG25","VALUE26","MFLAG26","QFLAG26","SFLAG26","VALUE27","MFLAG27","QFLAG27","SFLAG27","VALUE28","MFLAG28",
"QFLAG28","SFLAG28","VALUE29","MFLAG29","QFLAG29","SFLAG29","VALUE30","MFLAG30","QFLAG30","SFLAG30","VALUE31","MFLAG31","QFLAG31","SFLAG31"]
# output data frame
out_df = DataFrame(hcat(ID,YEAR,MONTH,ELEMENT,VALUE1,MFLAG1,QFLAG1,SFLAG1,VALUE2,MFLAG2,QFLAG2,SFLAG2,VALUE3,MFLAG3,QFLAG3,SFLAG3,VALUE4,MFLAG4,QFLAG4,SFLAG4,VALUE5,
MFLAG5,QFLAG5,SFLAG5,VALUE6,MFLAG6,QFLAG6,SFLAG6,VALUE7,MFLAG7,QFLAG7,SFLAG7,VALUE8,MFLAG8,QFLAG8,SFLAG8,VALUE9,MFLAG9,QFLAG9,SFLAG9,VALUE10,MFLAG10,
QFLAG10,SFLAG10,VALUE11,MFLAG11,QFLAG11,SFLAG11,VALUE12,MFLAG12,QFLAG12,SFLAG12,VALUE13,MFLAG13,QFLAG13,SFLAG13,VALUE14,MFLAG14,QFLAG14,SFLAG14,
VALUE15,MFLAG15,QFLAG15,SFLAG15,VALUE16,MFLAG16,QFLAG16,SFLAG16,VALUE17,MFLAG17,QFLAG17,SFLAG17,VALUE18,MFLAG18,QFLAG18,SFLAG18,VALUE19,MFLAG19,
QFLAG19,SFLAG19,VALUE20,MFLAG20,QFLAG20,SFLAG20,VALUE21,MFLAG21,QFLAG21,SFLAG21,VALUE22,MFLAG22,QFLAG22,SFLAG22,VALUE23,MFLAG23,QFLAG23,SFLAG23,
VALUE24,MFLAG24,QFLAG24,SFLAG24,VALUE25,MFLAG25,QFLAG25,SFLAG25,VALUE26,MFLAG26,QFLAG26,SFLAG26,VALUE27,MFLAG27,QFLAG27,SFLAG27,VALUE28,MFLAG28,
QFLAG28,SFLAG28,VALUE29,MFLAG29,QFLAG29,SFLAG29,VALUE30,MFLAG30,QFLAG30,SFLAG30,VALUE31,MFLAG31,QFLAG31,SFLAG31))
# rename data frame
rename!(out_df, colnames)
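# The 124 per-day VALUE/MFLAG/QFLAG/SFLAG vectors above could also be built with a loop over
# computed offsets. A commented sketch only (not a drop-in replacement - it collects the
# per-day columns into matrices instead of separate vectors):
#=
n = size(data,1)
VALUE = fill(0.0, n, 31); MFLAG = fill("", n, 31); QFLAG = fill("", n, 31); SFLAG = fill("", n, 31)
for i = 1:n, day = 1:31
    temp = data[i,1]
    o = 21 + 8*(day - 1)                          # day's 8-character block starts after char 21
    VALUE[i,day] = parse(Float64, temp[o+1:o+5]) / 10
    MFLAG[i,day] = string(temp[o+6]); QFLAG[i,day] = string(temp[o+7]); SFLAG[i,day] = string(temp[o+8])
end
=#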
# data statistics
# subset TMAX/TMIN/TAVG Elements
tmax = out_df[out_df.ELEMENT .== "TMAX", :]
tmin = out_df[out_df.ELEMENT .== "TMIN", :]
tavg = out_df[out_df.ELEMENT .== "TAVG", :]
month_index = vcat(1,2,3,4,collect(5:4:128)) # columns 1-4 are ID/YEAR/MONTH/ELEMENT; 5,9,...,125 are VALUE1..VALUE31
tmax_mo = tmax[:, month_index]
tmin_mo = tmin[:, month_index]
tavg_mo = tavg[:, month_index]
# build date index for the set
days_total = 31
#day_index = repeat(vcat(["01","02","03","04","05","06","07","08","09"],string.(collect(10:1:31))),inner = size(tmax,1))
day_index = vcat(["01","02","03","04","05","06","07","08","09"],string.(collect(10:1:days_total)))
# Loop through the rows, build a yyyymmdd date string for each day column and pull the TMAX temperature data
# (a daysinmonth-based alternative is sketched in the comment after this loop)
dims_tmax = size(tmax,1) * days_total
out_tmax = fill("", dims_tmax) # push to
data_out_tmax = zeros(dims_tmax) # push to
data_grab = [] # temp for loop
temp = [] # temp for loop
count = 0
count1 = 0
for j = 5:size(tmax_mo,2) # loop along columns
#print("this is fucking JJJ",j,"yee\n")
count = count +1
for i = 1 :size(tmax_mo,1) # loop down the rows
count1 = count1+1
if tmax_mo[i,3] == "02" && (day_index[count] == "29" || day_index[count] == "30" || day_index[count] == "31")
temp = ""
data_grab = 99999.999
elseif tmax_mo[i,3] == "04" && (day_index[count] == "31")
temp = ""
data_grab = 99999.999
elseif tmax_mo[i,3] == "06" && (day_index[count] == "31")
temp = ""
data_grab = 99999.999
elseif tmax_mo[i,3] == "09" && (day_index[count] == "31")
temp = ""
data_grab = 99999.999
elseif tmax_mo[i,3] == "11" && (day_index[count] == "31")
temp = ""
data_grab = 99999.999
elseif tmax_mo[i,3] != "01" || tmax_mo[i,3] != "04" || tmax_mo[i,3] != "06" || tmax_mo[i,3] != "09" || tmax_mo[i,3] != "11"
temp = join([tmax_mo[i,2],tmax_mo[i,3],day_index[count]])
data_grab = tmax_mo[i,j]
end
#push!(out_tmax,temp)
#push!(data_out_tmax,data_grab)
out_tmax[count1] = temp
data_out_tmax[count1] = data_grab
end
end
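# The hard-coded short-month branches above could instead be driven by Dates.daysinmonth.
# A commented sketch (an alternative, not the original method; note it would also keep
# Feb 29 in leap years, which the branches above always discard):
#=
for j = 5:size(tmax_mo,2), i = 1:size(tmax_mo,1)
    day = j - 4                                   # column j holds day (j - 4) of the month
    yr = parse(Int, tmax_mo[i,2]); mo = parse(Int, tmax_mo[i,3])
    if day <= Dates.daysinmonth(yr, mo)
        temp = join([tmax_mo[i,2], tmax_mo[i,3], day_index[day]])   # "yyyymmdd"
        data_grab = tmax_mo[i,j]
    else
        temp = ""                                 # not a valid calendar day for this month
        data_grab = 99999.999
    end
end
=#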
dims_tmin = size(tmin,1) * days_total
out_tmin = fill("", dims_tmin) # push to
data_out_tmin = zeros(dims_tmin) # push to
data_grab = [] # temp for loop
temp = [] # temp for loop
count = 0
count1 = 0
for j = 5:size(tmin_mo,2) # loop along columns
#print("this is fucking JJJ",j,"yee\n")
count = count +1
for i = 1 :size(tmin_mo,1) # loop down the rows
count1 = count1+1
if tmin_mo[i,3] == "02" && (day_index[count] == "29" || day_index[count] == "30" || day_index[count] == "31")
temp = ""
data_grab = 99999.999
elseif tmin_mo[i,3] == "04" && (day_index[count] == "31")
temp = ""
data_grab = 99999.999
elseif tmin_mo[i,3] == "06" && (day_index[count] == "31")
temp = ""
data_grab = 99999.999
elseif tmin_mo[i,3] == "09" && (day_index[count] == "31")
temp = ""
data_grab = 99999.999
elseif tmin_mo[i,3] == "11" && (day_index[count] == "31")
temp = ""
data_grab = 99999.999
elseif tmin_mo[i,3] != "01" || tmin_mo[i,3] != "04" || tmin_mo[i,3] != "06" || tmin_mo[i,3] != "09" || tmin_mo[i,3] != "11"
temp = join([tmin_mo[i,2],tmin_mo[i,3],day_index[count]])
data_grab = tmin_mo[i,j]
end
out_tmin[count1] = temp
data_out_tmin[count1] = data_grab
end
end
dims_avg = size(tavg,1) * days_total
if dims_avg != 0
out_tavg = fill("", dims_avg) # push to
data_out_tavg = zeros(dims_avg) # push to
data_grab = [] # temp for loop
temp = [] # temp for loop
count = 0
count1 = 0
for j = 5:size(tavg_mo,2) # loop along columns
#print("this is fucking JJJ",j,"yee\n")
count = count +1
for i = 1 :size(tavg_mo,1) # loop down the rows
count1 = count1+1
if tavg_mo[i,3] == "02" && (day_index[count] == "29" || day_index[count] == "30" || day_index[count] == "31")
temp = ""
data_grab = 99999.999
elseif tavg_mo[i,3] == "04" && (day_index[count] == "31")
temp = ""
data_grab = 99999.999
elseif tavg_mo[i,3] == "06" && (day_index[count] == "31")
temp = ""
data_grab = 99999.999
elseif tavg_mo[i,3] == "09" && (day_index[count] == "31")
temp = ""
data_grab = 99999.999
elseif tavg_mo[i,3] == "11" && (day_index[count] == "31")
temp = ""
data_grab = 99999.999
elseif tavg_mo[i,3] != "01" || tavg_mo[i,3] != "04" || tavg_mo[i,3] != "06" || tavg_mo[i,3] != "09" || tavg_mo[i,3] != "11"
temp = join([tavg_mo[i,2],tavg_mo[i,3],day_index[count]])
data_grab = tavg_mo[i,j]
end
out_tavg[count1] = temp
data_out_tavg[count1] = data_grab
end
end
end
# subset to remove missing data for the shorter months
out_tmax = out_tmax[out_tmax .!= ""] #
data_out_tmax = data_out_tmax[data_out_tmax .!= 99999.999]
out_tmin = out_tmin[out_tmin .!= ""] #
data_out_tmin = Float64.(data_out_tmin[data_out_tmin .!= 99999.999])
if dims_avg != 0
out_tavg = out_tavg[out_tavg .!= ""] #
data_out_tavg = Float64.(data_out_tavg[data_out_tavg .!= 99999.999])
end
# convert date to date format
out_tmax = Date.(out_tmax, "yyyymmdd")
out_tmin = Date.(out_tmin, "yyyymmdd")
if dims_avg != 0
out_tavg = Date.(out_tavg, "yyyymmdd")
end
# create final ordered by date dataframe
out_tmax = DataFrame(Date = out_tmax, TMAX = data_out_tmax)
final_tmax = sort(out_tmax,[:Date])
out_tmin = DataFrame(Date = out_tmin, TMIN = data_out_tmin)
final_tmin = sort(out_tmin,[:Date])
if dims_avg != 0
out_tavg = DataFrame(Date = out_tavg, TAVG = data_out_tavg)
final_tavg = sort(out_tavg,[:Date])
end
#######################################################
# left join to date index
#######################################################
#=test_date = final_tmax[1:5,1]
dummy_temp = [1.0,1.0,1.0,1.0,1.0]
df1 = DataFrame(Date = test_date, TMAX = dummy_temp)
global date_index_tmax = leftjoin(date_index_tmax, final_tmax, on = :Date, makeunique=true)
global date_index_tmin = leftjoin(date_index_tmin, final_tmin, on = :Date, makeunique=true)
=#
# Build output data frames (all parsed from fixed width .dly file)
#global all_out_tmax = vcat(all_out_tmax,final_tmax)
#global all_out_tmin = vcat(all_out_tmin,final_tmin)
#global all_out_tavg = vcat(all_out_tavg,final_tavg)
# write parsed .csv files to file
CSV.write(join(["C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/ghcnd_all_csv/tmax_us/",us_dly_files_for_export[d],".csv"]),final_tmax)
CSV.write(join(["C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/ghcnd_all_csv/tmin_us/",us_dly_files_for_export[d],".csv"]),final_tmin)
if dims_avg != 0
CSV.write(join(["C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/ghcnd_all_csv/tavg_us/",us_dly_files_for_export[d],".csv"]),final_tavg)
end
end # loop
# validation
#CSV.write("C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/tmin_mo.csv",tmax_mo)
#CSV.write("C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/data_per_dateTMIN.csv",final_tmax)
# read ghcnd-stations.txt
ghcn_stations = CSV.read("C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/ghcnd-stations.txt", DataFrame, header=false, delim=' ', ignorerepeated=true)
station_ID = ghcn_stations[:,1]
station_ID_chop = fill("",size(ghcn_stations,1))
US_stations = fill("",size(ghcn_stations,1))
for i = 1:size(ghcn_stations,1)
station_ID_chop[i] = station_ID[i][1:2] # subset first two of the ID
if station_ID_chop[i] == "US"
US_stations[i] = join([station_ID[i], ".csv"])
end
end
unique(US_stations)
US_stations = DataFrame(hcat(US_stations[US_stations .!= ""]))
# Read .csv data and create one array for statistics
# read all files in directory
csv_dir_tmax = DataFrame(hcat(cd(readdir, "C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/ghcnd_all_csv/tmax")))
csv_dir_tmin = DataFrame(hcat(cd(readdir, "C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/ghcnd_all_csv/tmin")))
csv_dir_tavg = DataFrame(hcat(cd(readdir, "C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/ghcnd_all_csv/tavg")))
# subset US stations
csv_dir_tmax = innerjoin(US_stations, csv_dir_tmax, on = :x1)
csv_dir_tmin = innerjoin(US_stations, csv_dir_tmin, on = :x1)
csv_dir_tavg = innerjoin(US_stations, csv_dir_tavg, on = :x1)
csv_dir_tmax = csv_dir_tmax[:,1]
csv_dir_tmin = csv_dir_tmin[:,1]
csv_dir_tavg = csv_dir_tavg[:,1]
# get the dimensions of each .csv files
tmax_all_total_output_size = fill(0, size(csv_dir_tmax,1))
tmin_all_total_output_size = fill(0, size(csv_dir_tmin,1))
tavg_all_total_output_size = fill(0, size(csv_dir_tavg,1))
tmax_dim = Int64[]
tmin_dim = Int64[]
tavg_dim = Int64[]
c=1
for c = 1:length(csv_dir_tmin) # 47484 - last point
print("Starting csv file ", csv_dir_tmin[c]," - Iteration ",c,"\n")
if c <= length(csv_dir_tmax)
tmax_csv = CSV.read(join(["C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/ghcnd_all_csv/tmax/", csv_dir_tmax[c] ]), DataFrame, header=true)
global tmax_dim = size(tmax_csv,1)
tmax_all_total_output_size[c] = tmax_dim
end
if c <= length(csv_dir_tmin)
tmin_csv = CSV.read(join(["C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/ghcnd_all_csv/tmin/", csv_dir_tmin[c]]), DataFrame, header=true)
global tmin_dim = size(tmin_csv,1)
tmin_all_total_output_size[c] = tmin_dim
end
if c <= length(csv_dir_tavg)
tavg_csv = CSV.read(join(["C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/ghcnd_all_csv/tavg/", csv_dir_tavg[c]]), DataFrame, header=true)
global tavg_dim = size(tavg_csv,1)
tavg_all_total_output_size[c] = tavg_dim
end
end
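# The row counts could also be gathered without fully parsing each file, e.g. with countlines
# (a sketch; assumes every .csv has exactly one header line):
#= tmax_all_total_output_size = [countlines(join(["C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/ghcnd_all_csv/tmax/", f])) - 1 for f in csv_dir_tmax] =#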
# sum total dimension of all .csv files
tmax_sum = sum(tmax_all_total_output_size)
tmin_sum = sum(tmin_all_total_output_size)
tavg_sum = sum(tavg_all_total_output_size)
# initialize final output to total dimension
tmax_date_array = fill(Date("13000101", "yyyymmdd"),tmax_sum)
tmax_array = zeros(tmax_sum)
tmin_date_array = fill(Date("13000101", "yyyymmdd"),tmin_sum)
tmin_array = zeros(tmin_sum)
tavg_date_array = fill(Date("13000101", "yyyymmdd"),tavg_sum)
tavg_array = zeros(tavg_sum)
# initialize outputs
tmax_all = DataFrame(Date = tmax_date_array, TMAX = tmax_array)
tmin_all = DataFrame(Date = tmin_date_array, TMIN = tmin_array)
tavg_all = DataFrame(Date = tavg_date_array, TAVG = tavg_array)
tmax_count = 0
tmin_count = 0
tavg_count = 0
c=1
i = 1
for c = 1:length(csv_dir_tmin) # 47484 - last point # loop along the files
print("Starting csv file ", csv_dir_tmin[c]," - Iteration ",c,"\n")
if c <= length(csv_dir_tmax)
csv_tmax = CSV.read(join(["C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/ghcnd_all_csv/tmax/", csv_dir_tmax[c]]), DataFrame, header=true)
if size(csv_tmax,1) != 0
for i = 1:size(csv_tmax,1) # loop along the file index
global tmax_count = tmax_count + 1 # declared global so the count persists when run as a script
tmax_all[tmax_count,1] = csv_tmax[i,1] # fill date
tmax_all[tmax_count,2] = csv_tmax[i,2] # fill tmax
end
end
end
if c <= length(csv_dir_tmin)
csv_tmin = CSV.read(join(["C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/ghcnd_all_csv/tmin/", csv_dir_tmin[c]]), DataFrame, header=true)
if size(csv_tmin,1) != 0
for i = 1:size(csv_tmin,1) # loop along the file index
global tmin_count = tmin_count + 1 # declared global so the count persists when run as a script
# print("this count should be continuous", tmin_count)
tmin_all[tmin_count,1] = csv_tmin[i,1] # fill date
tmin_all[tmin_count,2] = csv_tmin[i,2] # fill tmin
end
end
end
if c <= length(csv_dir_tavg)
csv_tavg = CSV.read(join(["C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/ghcnd_all_csv/tavg/", csv_dir_tavg[c]]), DataFrame, header=true)
if size(csv_tavg,1) != 0
for i = 1:size(csv_tavg,1) # loop along the file index
global tavg_count = tavg_count + 1 # declared global so the count persists when run as a script
tavg_all[tavg_count,1] = csv_tavg[i,1] # fill date
tavg_all[tavg_count,2] = csv_tavg[i,2] # fill tavg
end
end
end
end
# Bogil solution
tmax_all_test = reduce(vcat, [CSV.read("C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/ghcnd_all_csv/tmax/$x", DataFrame) for x in csv_dir_tmax]) # 235496952×2
# to test
# save .csv
#=
CSV.write("C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/tmax_all.csv",tmax_all)
CSV.write("C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/tmin_all.csv",tmin_all)
CSV.write("C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/tavg_all.csv",tavg_all)
=#
# read data
tmax_all = CSV.read("C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/tmax_all.csv", DataFrame,header=true)
tmin_all = CSV.read("C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/tmin_all.csv", DataFrame,header=true)
tavg_all = CSV.read("C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/tavg_all.csv", DataFrame,header=true)
# add month / year columns to dataframe
tmax_all.year = year.(tmax_all[:,1])
tmax_all.month = month.(tmax_all[:,1])
tmin_all.year = year.(tmin_all[:,1])
tmin_all.month = month.(tmin_all[:,1])
tavg_all.year = year.(tavg_all[:,1])
tavg_all.month = month.(tavg_all[:,1])
# subset to remove missing-data sentinels (-999.9, -99.99) and obvious bad values
tmax_sub = tmax_all[tmax_all.TMAX .!= -999.9, :]
tmax_sub = tmax_sub[tmax_sub.TMAX .!= -99.99, :]
tmax_sub = tmax_sub[tmax_sub.TMAX .!= -573.3, :]
tmax_sub = tmax_sub[tmax_sub.TMAX .!= -409.5, :]
tmax_sub = tmax_sub[tmax_sub.TMAX .!= -408.1, :]
tmax_sub = tmax_sub[tmax_sub.TMAX .!= -406.5, :]
tmax_sub = tmax_sub[tmax_sub.TMAX .!= -405.5, :]
tmax_sub = tmax_sub[tmax_sub.TMAX .!= -404.3, :]
tmax_sub = tmax_sub[tmax_sub.TMAX .!= -402.5, :]
tmax_sub = tmax_sub[tmax_sub.TMAX .!= -401.8, :]
tmax_sub = tmax_sub[tmax_sub.TMAX .!= -400.9, :]
tmax_sub = tmax_sub[tmax_sub.TMAX .!= -399.6, :]
minimum(tmax_sub.TMAX)
tmin_sub = tmin_all[tmin_all.TMIN .!= -999.9, :]
tmin_sub = tmin_sub[tmin_sub.TMIN .!= -99.99, :]
minimum(tmin_sub.TMIN)
tavg_sub = tavg_all[tavg_all.TAVG .!= -999.9, :]
tavg_sub = tavg_sub[tavg_sub.TAVG .!= -99.99, :]
minimum(tavg_sub.TAVG)
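# The per-value filters above could also be expressed as a single threshold, since the bad
# values are all large-magnitude sentinels / outliers. A sketch only - the -100.0 degC cutoff
# is an assumption, not part of the original analysis:
#=
tmax_sub = tmax_all[tmax_all.TMAX .> -100.0, :]
tmin_sub = tmin_all[tmin_all.TMIN .> -100.0, :]
tavg_sub = tavg_all[tavg_all.TAVG .> -100.0, :]
=#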
# group by for statistics
#tmax - all mean
gdf = groupby(tmax_sub, [:Date])
tmax_mean_out = combine(gdf, :TMAX => mean)
tmax_mean_out = sort(tmax_mean_out,[:Date])
roll_mean_tmax = zeros(size(tmax_mean_out,1))
n=5
for i = n:size(tmax_mean_out,1)
roll_mean_tmax[i] = mean(tmax_mean_out.TMAX_mean[i-n+1:i])
end
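# Note: roll_mean_tmax leaves its first n-1 entries at 0.0, which later plot as real values.
# A sketch that pads the warm-up period with NaN so those points are skipped when plotting:
#= roll_mean_tmax = [i < n ? NaN : mean(tmax_mean_out.TMAX_mean[i-n+1:i]) for i = 1:size(tmax_mean_out,1)] =#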
#tmax - by year
gdf = groupby(tmax_sub, [:year])
tmax_mean_out = combine(gdf, :TMAX => mean)
tmax_mean_out = sort(tmax_mean_out,[:year])
roll_mean_tmax = zeros(size(tmax_mean_out,1))
n=5
for i = n:size(tmax_mean_out,1)
roll_mean_tmax[i] = mean(tmax_mean_out.TMAX_mean[i-n+1:i])
end
#tmax - by year / month
gdf = groupby(tmax_sub, [:year, :month])
tmax_mean_out = combine(gdf, :TMAX => mean)
y_out = string.(tmax_mean_out.year)
m_out = string.(tmax_mean_out.month)
date_index_sub = fill("", size(tmax_mean_out,1))
for i = 1:size(date_index_sub,1)
date_index_sub[i] = join([y_out[i],m_out[i]])
end
tmax_mean_out.date_index_sub = Date.(date_index_sub, "yyyymm")
tmax_mean_out = sort(tmax_mean_out,[:date_index_sub])
#tmin - daily average
gdf = groupby(tmin_sub, [:Date])
tmin_mean_out = combine(gdf, :TMIN => mean)
tmin_mean_out = sort(tmin_mean_out,[:Date])
#tmin - by year
gdf = groupby(tmin_sub, [:year])
tmin_mean_out = combine(gdf, :TMIN => mean)
tmin_mean_out = sort(tmin_mean_out,[:year])
roll_mean_tmin = zeros(size(tmin_mean_out,1))
n=5
for i = n:size(tmin_mean_out,1)
roll_mean_tmin[i] = mean(tmin_mean_out.TMIN_mean[i-n+1:i])
end
#tmin - by year / month
gdf = groupby(tmin_sub, [:year, :month])
tmin_mean_out = combine(gdf, :TMIN => mean)
y_out = string.(tmin_mean_out.year)
m_out = string.(tmin_mean_out.month)
date_index_sub = fill("", size(tmin_mean_out,1))
for i = 1:size(date_index_sub,1)
date_index_sub[i] = join([y_out[i],m_out[i]])
end
tmin_mean_out.date_index_sub = Date.(date_index_sub, "yyyymm")
tmin_mean_out = sort(tmin_mean_out,[:date_index_sub])
#tavg
gdf = groupby(tavg_sub, [:Date])
tavg_mean_out = combine(gdf, :TAVG => mean)
tavg_mean_out = sort(tavg_mean_out,[:Date])
#tavg - by year
gdf = groupby(tavg_sub, [:year])
tavg_mean_out = combine(gdf, :TAVG => mean)
tavg_mean_out = sort(tavg_mean_out,[:year])
roll_mean_tavg = zeros(size(tavg_mean_out,1))
n = 5
for i = n:size(tavg_mean_out,1)
roll_mean_tavg[i] = mean(tavg_mean_out.TAVG_mean[i-n+1:i])
end
maximum(tavg_mean_out.TAVG_mean)
#tavg- by year / month
gdf = groupby(tavg_sub, [:year, :month])
tavg_mean_out = combine(gdf, :TAVG => mean)
y_out = string.(tavg_mean_out.year)
m_out = string.(tavg_mean_out.month)
date_index_sub = fill("", size(tavg_mean_out,1))
for i = 1:size(date_index_sub,1)
date_index_sub[i] = join([y_out[i],m_out[i]])
end
tavg_mean_out.date_index_sub = Date.(date_index_sub, "yyyymm")
tavg_mean_out = sort(tavg_mean_out,[:date_index_sub])
# difference between the group-by-year means - TMAX minus TMIN
diff = tmax_mean_out.TMAX_mean .- tmin_mean_out.TMIN_mean
# Plot
# difference group by year - MAX - MIN
plot(tmin_mean_out.year, diff, legend = false, title = "All US Stations - Annual Average of Difference MAX-MIN Daily Temperature", titlefontsize = 10)
#tmax
plot(tmax_mean_out.Date, tmax_mean_out.TMAX_mean, legend = false, title = "All US Stations - Average of MAX Daily Temperature")
plot!(tmax_mean_out.Date,roll_mean_tmax)
# tmax group by year plot
plot(tmax_mean_out.year, tmax_mean_out.TMAX_mean, legend = false, title = "All US Stations - Annual Average of MAX Daily Temperature")
# tmax group by year & month plot
plot(tmax_mean_out.date_index_sub, tmax_mean_out.TMAX_mean, legend = false, title = "All US Stations - Monthly Average of MAX Daily Temperature")
plot!(tmax_mean_out.date_index_sub,roll_mean_tmax)
#tmin
plot(tmin_mean_out.Date, tmin_mean_out.TMIN_mean, legend = false, title = "All US Stations - Average of MIN Daily Temperature")
plot!(tmin_mean_out.Date,roll_mean_tmin)
# tmin group by year plot
plot(tmin_mean_out.year, tmin_mean_out.TMIN_mean, legend = false, title = "All US Stations - Annual Average of MIN Daily Temperature")
# tmin group by year & month plot
plot(tmin_mean_out.date_index_sub, tmin_mean_out.TMIN_mean, legend = false, title = "All US Stations - Monthly Average of MIN Daily Temperature")
#tavg
plot(tavg_mean_out.Date, tavg_mean_out.TAVG_mean, legend = false, title = "All US Stations - Average of AVG Daily Temperature")
plot!(tavg_mean_out.Date,roll_mean_tavg)
# tavg group by year plot
plot(tavg_mean_out.year, tavg_mean_out.TAVG_mean, legend = false, title = "All US Stations - Annual Average of Daily Average (TAVG) Temperature",titlefontsize = 10)
plot!(tavg_mean_out.year ,roll_mean_tavg)
# tavg group by year & month plot
plot(tavg_mean_out.date_index_sub, tavg_mean_out.TAVG_mean, legend = false, title = "All US Stations - Monthly Average of Average Daily Temperature",titlefontsize = 10)
# Reference: reading / filtering GHCN weather data and left joining to a master date index
# http://spatialreasoning.com/wp/20170307_1244_r-reading-filtering-weather-data-from-the-global-historical-climatology-network-ghcn
#tmax_mo # only defined inside the per-file parsing loop above
# write csv
CSV.write("C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/all_out_tmax.csv",all_out_tmax)
CSV.write("C:/Users/andrew.bannerman/Desktop/Julia/scripts/GHCN data/data_per_date.csv",final_tmax)