Created
December 3, 2017 01:04
-
-
Save mkiang/79b4bbef489a772630d3c6c1da500184 to your computer and use it in GitHub Desktop.
Weather plots
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## For blog post here: https://mathewkiang.com/2017/12/02/tldr-san-diego-weather-is-better-than-boston-weather/ | |
## Imports ---- | |
library(tidyverse) | |
library(GSODR) | |
library(ggmap) | |
## Define places we are interested in and google maps query ---- | |
city_dict <- list(sandiego = "san diego airport", | |
boston = "boston logan") | |
## Query google maps for latitude longitude ---- | |
ll <- ggmap::geocode(unlist(city_dict, use.names = FALSE)) | |
## Turn geocode query into a dataframe ---- | |
cities <- data.frame(list(city = names(city_dict), | |
query = unlist(city_dict, use.names = FALSE), | |
lon = ll$lon, | |
lat = ll$lat), | |
stringsAsFactors = FALSE) | |
## Find the stations within 10 miles of our geocode ---- | |
city_station <- list() | |
for (r in 1:nrow(cities)) { | |
print(cities[r, 'search_q']) | |
temp_station <- nearest_stations(LAT = cities[r, 'lat'], | |
LON = cities[r, 'lon'], | |
distance = 10 * 1/.62) | |
city_station[[cities[r, 'city']]] <- temp_station | |
rm(temp_station) | |
} | |
## Join stations to our city/geocode dataframe ---- | |
cs_df <- NULL | |
for (i in seq_along(city_station)) { | |
temp_cs <- cbind(city = rep(names(city_station)[i], | |
length(city_station[[i]])), | |
stnid = city_station[[i]]) | |
cs_df <- rbind(cs_df, temp_cs) | |
rm(temp_cs) | |
} | |
cities <- left_join(cities, | |
as_data_frame(cs_df), | |
by = "city") | |
## Save cities dataframe ---- | |
dir.create(path = './data', showWarnings = FALSE, recursive = TRUE) | |
write_csv(cities, path = "./data/cities_df.csv") | |
## Download all cities for ~60 years and save the raw file ---- | |
weather_df <- get_GSOD(years = 1950:2017, station = cities$stnid) | |
write_csv(weather_df, path = "./data/raw_weather_data.csv.gz") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## For blog post here: https://mathewkiang.com/2017/12/02/tldr-san-diego-weather-is-better-than-boston-weather/ | |
## Imports ---- | |
library(tidyverse) | |
library(GSODR) | |
library(ggmap) | |
library(RColorBrewer) | |
## Helpers ---- | |
## Convert C to F | |
c_to_f <- function(temp) { | |
return(temp * 9/5 + 32) | |
} | |
## For x-axis | |
days_in_mo <- function() { | |
return(c(31, 28, 31, 30, 31, 30, 31, | |
31, 30, 31, 30, 31)) | |
} | |
## For hlines | |
seq_gen <- function(df) { | |
lo <- floor(min(df$min_temp, na.rm = TRUE) / 10) * 10 | |
hi <- ceiling(max(df$max_temp, na.rm = TRUE) / 10) * 10 | |
return(seq(lo, hi, 10)) | |
} | |
## Plotting theme | |
mk_weather <- function(...) { | |
## Colors — stick with the ggplot2() greys | |
c_bg <- "white" | |
c_grid <- "grey50" | |
c_btext <- "grey5" | |
c_mtext <- "grey30" | |
# Begin construction of chart | |
theme_bw(base_size = 12, base_family = "Arial Narrow") + | |
# Region | |
theme(panel.background = element_rect(fill = c_bg, color = c_bg), | |
plot.background = element_rect(fill = c_bg, color = c_bg), | |
panel.border = element_rect(color = c_bg)) + | |
# Grid | |
theme(panel.grid.major = element_blank(), | |
panel.grid.minor = element_blank(), | |
axis.ticks = element_blank(), | |
axis.line.x = element_blank(), | |
axis.ticks.x = element_blank(), | |
axis.ticks.length = unit(0, "cm")) + | |
# Legend | |
theme(legend.position = "none", | |
legend.justification = c(0, 1), | |
legend.direction = "vertical", | |
legend.key = element_rect(fill = NA, color = NA), | |
legend.background = element_rect(fill = "transparent", color = NA), | |
legend.text = element_text(color = c_mtext)) + | |
# Titles, labels, etc. | |
theme(plot.title = element_text(color = c_btext, vjust = 1.25, | |
face = "bold", size = 18), | |
axis.text = element_text(size = 10, color = c_mtext), | |
axis.text.x = element_text(size = 10, color = c_mtext, | |
hjust = .5), | |
axis.title.x = element_text(size = 12, color = c_mtext, | |
hjust = 1), | |
axis.title.y = element_text(size = 12, color = c_mtext, | |
hjust = 1)) + | |
# Facets | |
theme(strip.background = element_rect(fill = "white", color = "black"), | |
strip.text = element_text(size = 10, color = c_btext)) + | |
# Plot margins | |
theme(plot.margin = unit(c(0.35, .2, 0.3, 0.35), "cm")) + | |
# Additionals | |
theme(...) | |
} | |
## Read in data: assumes you ran `01_get_data.R` ---- | |
cities <- read_csv('./data/cities_data.csv') | |
weather_df <- read_csv('./data/raw_weather_data.csv.gz') | |
## Remove columns we won't use, rename, clean up, etc. ---- | |
## Also filter out leap days because it's easier and I'm lazy. | |
sub_df <- weather_df %>% | |
tbl_df() %>% | |
setNames(tolower(names(.))) %>% | |
select(stnid, stn_name, country = ctry, state, lat, lon, | |
year, month, day, temp, visib, wdsp, max_temp = max, | |
min_temp = min, prcp, prcp_flag, rh) %>% | |
left_join(select(cities, stnid, city, city_lon = lon, city_lat = lat), | |
by = "stnid") %>% | |
mutate(month = as.integer(month), | |
day = as.integer(day), | |
year = as.integer(year), | |
prcp = as.numeric(prcp)) %>% | |
filter(!(day == 29 & month == 2)) %>% | |
group_by(stnid, year) %>% | |
arrange(year, month, day) %>% | |
mutate(doy = 1:n()) | |
## Now get the things we will need for the NYTimes graph: ---- | |
## (1) Record high/low by city | |
## (2) Daily average high/low by city | |
## (2) High and lows for 2017 | |
## (3) Days in 2017 that were record high/low | |
## (3) Weather for 2017 | |
weather_2017 <- sub_df %>% | |
filter(year == 2017) %>% | |
group_by(city, doy) %>% | |
select(year_max = max_temp, | |
year_min = min_temp, | |
doy, city) %>% | |
summarize(year_max = c_to_f(max(year_max, na.rm = TRUE)), | |
year_min = c_to_f(min(year_min, na.rm = TRUE))) | |
## (1), (2), and (4) | |
by_city_summary <- sub_df %>% | |
group_by(city, doy) %>% | |
summarize(mean_hi = c_to_f(mean(max_temp, na.rm = TRUE)), | |
mean_lo = c_to_f(mean(min_temp, na.rm = TRUE)), | |
max_temp = c_to_f(max(max_temp, na.rm = TRUE)), | |
min_temp = c_to_f(min(min_temp, na.rm = TRUE)), | |
stn_name = first(stn_name), | |
stnid = first(stnid), | |
county = first(country), | |
state = first(state), | |
lat = first(lat), | |
lon = first(lon), | |
city_lon = first(city_lon), | |
city_lat = first(city_lat)) | |
by_city_summary <- by_city_summary %>% | |
left_join(weather_2017, by = c("doy", "city")) %>% | |
mutate(record = ifelse(year_max == max_temp, year_max, | |
ifelse(year_min == min_temp, year_min, NA)), | |
record_col = ifelse(year_max == max_temp, "record high", | |
ifelse(year_min == min_temp, "record low", NA)), | |
record_y = ifelse(record_col == "record high", year_max + 3.5, | |
ifelse(record_col == "record low", year_min - 3.5, | |
NA)), | |
record_yend = ifelse(record_col == "record high", year_max + .5, | |
ifelse(record_col == "record low", year_min - .5, | |
NA))) | |
## San Diego plot ---- | |
sd_data <- filter(by_city_summary, city == "sandiego") | |
sdplot <- ggplot(sd_data, aes(x = doy)) + | |
geom_vline(xintercept = cumsum(c(days_in_mo()[-12])), | |
color = "black", alpha = .5, linetype = "dotted") + | |
geom_linerange(aes(ymax = max_temp, ymin = min_temp), | |
color = "tan", alpha = .15, size = 1) + | |
geom_linerange(aes(ymax = mean_hi, ymin = mean_lo), | |
color = "plum4", alpha = .75, size = .75) + | |
geom_hline(yintercept = seq_gen(sd_data), | |
color = "white", alpha = 1) + | |
geom_linerange(aes(ymax = year_max, ymin = year_min), | |
color = "deeppink4", alpha = .75, size = .5) + | |
geom_vline(xintercept = c(0, 366), alpha = 1, color = "grey70", | |
linetype = "solid") + | |
geom_point(aes(y = record), color = "white", size = 2, alpha = .75) + | |
geom_point(aes(y = record), color = "black", size = 1.5, alpha = .75) + | |
mk_weather(axis.title.y.right = element_text(hjust = 1, angle = 90)) + | |
scale_y_continuous(expression("Temperature ("*degree*"F)"), | |
labels = function(x) { | |
parse(text = paste0(x, "*degree")) | |
}, | |
sec.axis = | |
sec_axis(trans = ~ (. -32) * 5/9, | |
labels = function(x) { | |
parse(text = paste0(x, "*degree")) | |
}, | |
name = expression("Temperature ("*degree*"C)")), | |
breaks = seq_gen(sd_data), | |
expand = c(0, 0)) + | |
scale_x_continuous(NULL, expand = c(0, 1.25), | |
breaks = days_in_mo()/2 + | |
cumsum(c(0, days_in_mo()[-12])), | |
labels = c("January", "February", "March", "April", | |
"May", "June", "July", "August", "September", | |
"October", "November", "December")) + | |
labs(title = "Temperature in San Diego, CA", | |
subtitle = paste0("Light brown bars represent the record high and low ", | |
"temperature for each day since 1950.", | |
'\nViolet bars represent the average daily high ', | |
'and low temperature.', | |
"\nPurple bars represent the daily high/low", | |
" observed in 2017.", | |
"\nBlack points represent record highs/lows that", | |
" were set or tied in 2017.")) | |
ggsave(sdplot, filename = "./san_diego.png", | |
height = 4, width = 6.5, scale = 1.35, dpi = 300) | |
## Boston ---- | |
bosdata <- filter(by_city_summary, city == "boston") | |
bosplot <- ggplot(bosdata, aes(x = doy)) + | |
geom_vline(xintercept = cumsum(c(days_in_mo()[-12])), | |
color = "black", alpha = .5, linetype = "dotted") + | |
geom_linerange(aes(ymax = max_temp, ymin = min_temp), | |
color = "tan", alpha = .15, size = 1) + | |
geom_linerange(aes(ymax = mean_hi, ymin = mean_lo), | |
color = "plum4", alpha = .75, size = 1) + | |
geom_hline(yintercept = seq_gen(bosdata), | |
color = "white", alpha = 1) + | |
geom_linerange(aes(ymax = year_max, ymin = year_min), | |
color = "deeppink4", alpha = .75, size = .5) + | |
geom_vline(xintercept = c(0, 366), alpha = 1, color = "grey70", | |
linetype = "solid") + | |
geom_point(aes(y = record), color = "white", size = 2, alpha = .75) + | |
geom_point(aes(y = record), color = "black", size = 1.5, alpha = .75) + | |
mk_weather(axis.title.y.right = element_text(hjust = 1, angle = 90)) + | |
scale_y_continuous(expression("Temperature ("*degree*"F)"), | |
labels = function(x) { | |
parse(text = paste0(x, "*degree")) | |
}, | |
sec.axis = | |
sec_axis(trans = ~ (. -32) * 5/9, | |
labels = function(x) { | |
parse(text = paste0(x, "*degree")) | |
}, | |
name = expression("Temperature ("*degree*"C)")), | |
breaks = seq_gen(bosdata), | |
expand = c(0, 0)) + | |
scale_x_continuous(NULL, | |
expand = c(0, 1.25), | |
breaks = days_in_mo()/2 + | |
cumsum(c(0, days_in_mo()[-12])), | |
labels = c("January", "February", "March", "April", | |
"May", "June", "July", "August", "September", | |
"October", "November", "December")) + | |
labs(title = "Temperature in Boston, MA", | |
subtitle = paste0("Light brown bars represent the record high and low ", | |
"temperature for each day since 1950.", | |
'\nViolet bars represent the average daily high ', | |
'and low temperature.', | |
"\nPurple bars represent the daily high/low", | |
" observed in 2017.", | |
"\nBlack points represent record highs/lows that", | |
" were set or tied in 2017.")) | |
ggsave(bosplot, filename = "./boston.png", | |
height = 4, width = 6.5, scale = 1.35, dpi = 300) | |
## Mean hi/lo: boston vs san diego ----- | |
sdvsbos <- ggplot(by_city_summary, | |
aes(x = doy, ymax = mean_hi, ymin = mean_lo, | |
color = city, fill = city)) + | |
geom_vline(xintercept = cumsum(c(days_in_mo()[-12])), | |
color = "black", alpha = .5, linetype = "dotted") + | |
geom_ribbon(aes(ymax = mean_hi, ymin = mean_lo), color = NA, alpha = .5) + | |
geom_hline(yintercept = seq(20, 90, 10), | |
color = "white", alpha = 1) + | |
geom_vline(xintercept = c(0, 366), alpha = 1, color = "grey70", | |
linetype = "solid") + | |
mk_weather(axis.title.y.right = element_text(hjust = 1, angle = 90)) + | |
scale_y_continuous(expression("Temperature ("*degree*"F)"), | |
labels = function(x) { | |
parse(text = paste0(x, "*degree")) | |
}, | |
sec.axis = | |
sec_axis(trans = ~ (. -32) * 5/9, | |
labels = function(x) { | |
parse(text = paste0(x, "*degree")) | |
}, | |
name = expression("Temperature ("*degree*"C)")), | |
breaks = seq(20, 90, 10), | |
expand = c(0, 0)) + | |
scale_x_continuous(NULL, | |
expand = c(0, 1.25), | |
breaks = days_in_mo()/2 + | |
cumsum(c(0, days_in_mo()[-12])), | |
labels = c("January", "February", "March", "April", | |
"May", "June", "July", "August", "September", | |
"October", "November", "December")) + | |
labs(title = "Mean daily high and low temperature", | |
subtitle = "Boston vs San Diego, 1950-2016") | |
ggsave(sdvsbos, filename = "./bostonvssandiego.png", | |
height = 4, width = 6.5, scale = 1.35, dpi = 300) | |
## Cumulative percipitation ---- | |
rain_df <- sub_df %>% | |
filter(grepl(pattern = "INTERNATIONAL AIRPORT", | |
stn_name)) %>% | |
group_by(stn_name, year) %>% | |
mutate(cumulative_prcp = cumsum(ifelse(is.na(prcp), 0, prcp)), | |
city_name = factor(city, | |
levels = c("boston", "sandiego"), | |
labels = c("Boston, MA", "San Diego, CA"), | |
ordered = TRUE)) | |
rain_plot <- ggplot(rain_df, | |
aes(x = doy, y = cumulative_prcp/100, group = year, color = -year)) + | |
geom_line(alpha = .35) + | |
facet_grid(~ city_name) + | |
geom_vline(xintercept = cumsum(c(days_in_mo()[-12])), | |
color = "black", alpha = .5, linetype = "dotted") + | |
geom_hline(yintercept = seq(0, 22.5, 5), | |
color = "white", alpha = 0) + | |
geom_vline(xintercept = c(0, 366), alpha = 1, color = "grey70", | |
linetype = "solid") + | |
mk_weather(axis.title.y.right = element_text(hjust = 1, angle = 90)) + | |
scale_y_continuous("Precipitation (in)", | |
sec.axis = | |
sec_axis(trans = ~ . * 2.54, | |
name = "Precipitation (cm)"), | |
expand = c(0, 0)) + | |
scale_x_continuous(NULL, | |
expand = c(0, 1.25), | |
breaks = days_in_mo()/2 + | |
cumsum(c(0, days_in_mo()[-12])), | |
labels = c("January", "February", "March", "April", | |
"May", "June", "July", "August", "September", | |
"October", "November", "December")) + | |
labs(title = "Cumulative precipitation by year", | |
subtitle = "Boston vs San Diego, 1950-2016") | |
ggsave(rain_plot, filename = "./cumulative_rain.png", | |
height = 4, width = 10, scale = 1.6, dpi = 300) | |
## Temperature difference from the day before ---- | |
test <- sub_df %>% | |
filter(grepl(pattern = "INTERNATIONAL AIRPORT", | |
stn_name)) %>% | |
group_by(stn_name, year) %>% | |
arrange(year, doy) %>% | |
mutate(temp_lag_1 = temp - lag(temp), | |
city_name = factor(city, | |
levels = c("boston", "sandiego"), | |
labels = c("Boston, MA", "San Diego, CA"), | |
ordered = TRUE)) | |
temperature_diff <- ggplot(test, | |
aes(x = doy, y = temp_lag_1, | |
group = year, color = -year)) + | |
geom_line(alpha = .15) + | |
facet_grid(~ city_name) + | |
geom_vline(xintercept = cumsum(c(days_in_mo()[-12])), | |
color = "black", alpha = .5, linetype = "dotted") + | |
geom_hline(yintercept = seq(-20, 20, 10), | |
color = "white", alpha = 1) + | |
geom_vline(xintercept = c(0, 366), alpha = 1, color = "grey70", | |
linetype = "solid") + | |
mk_weather(axis.title.y.right = element_text(hjust = 1, angle = 90)) + | |
scale_y_continuous(expression("Temperature ("*degree*"F)"), | |
labels = function(x) { | |
parse(text = paste0(x, "*degree")) | |
}, | |
breaks = seq(-20, 20, 5), | |
expand = c(0, 0)) + | |
scale_x_continuous(NULL, | |
expand = c(0, 1.25), | |
breaks = days_in_mo()/2 + | |
cumsum(c(0, days_in_mo()[-12])), | |
labels = c("January", "February", "March", "April", | |
"May", "June", "July", "August", "September", | |
"October", "November", "December")) + | |
labs(title = "Daily Temperature Change", | |
subtitle = "Boston vs San Diego, 1950-2016") | |
ggsave(temperature_diff, filename = "./temp_diff.png", | |
height = 4, width = 10, scale = 1.6, dpi = 200) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment