Created
June 22, 2016 06:34
-
-
Save arthurwuhoo/44117aa15128080c6767a22a50ee8c62 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################################
# Reading in 2015-2016 Weather Data for Cape Town
################################################################
library(rvest)

# Custom-history page for airport station FACT, 2015-06-21 to 2016-06-21.
# The URL is assembled from pieces only to keep the lines short; the
# resulting string is identical to the single-literal form.
weather_url <- paste0(
  "https://www.wunderground.com/history/airport/FACT/2015/6/21/",
  "CustomHistory.html?dayend=21&monthend=6&yearend=2016",
  "&req_city=&req_state=&req_statename=",
  "&reqdb.zip=&reqdb.magic=&reqdb.wmo="
)
tables <- read_html(weather_url)

# Pull the first node with id "obsTable" and parse it into a table.
raw_weather <- tables %>%
  html_nodes(css = "#obsTable") %>%
  .[[1]] %>%
  html_table(header = TRUE, fill = TRUE)
dim(raw_weather) # looks like about 365 days that we want with 21 variables

# The table repeats two header rows for every month, which pollutes the
# data. We want to 1) create a consistent date column and 2) remove the
# extraneous headers.
#
# `[[` extracts a plain vector whether html_table() returned a data.frame
# or a tibble, and `%in%` never yields NA, so a missing cell can neither
# drop a real day nor inject an all-NA row. Keeping every day matters
# because dates are later assigned by row position.

# Gets rid of the second recurring header (the "high"/"avg"/"low" row).
is_stat_header <- raw_weather[[2]] %in% "high"
weather <- raw_weather[!is_stat_header, ]

# Gets rid of the first recurring header (the row ending in "Events").
is_month_header <- weather[[21]] %in% "Events"
weather <- weather[!is_month_header, ]
### Get the column names looking right: one day-of-month column, six
### high/avg/low triples, then precipitation and the events label.
colnames(weather) <- c(
  "Day", "TempHigh", "TempAvg", "TempLow",
  "DewHigh", "DewAvg", "DewLow",
  "HumidHigh", "HumidAvg", "HumidLow",
  "PressureHigh", "PressureAvg", "PressureLow",
  "VisHigh", "VisAvg", "VisLow",
  "WindHigh", "WindAvg", "WindLow",
  "Precip", "Events"
)

# Every column except Events is a measurement and is coerced to numeric.
# Entries that are not parseable as numbers become NA (with a warning).
numeric_cols <- setdiff(colnames(weather), "Events")
weather_num <- as.data.frame(lapply(weather[, numeric_cols], as.numeric))

# Name the factor column explicitly; an unnamed cbind() argument would be
# labelled with its deparsed expression, leaving `weather_final$Events`
# NULL.
weather_final <- cbind(weather_num, Events = factor(weather$Events))
str(weather_final)
### Now, we need to get the dates in order.
# The scraped Day column only holds the day of the month, so a full date
# is attached by row position, starting at the first day requested.
nrow(weather_final)

# Build calendar dates (Date class, so no time-zone or daylight-saving
# drift) and size the sequence to the data rather than to a fixed end
# date, so the assignment cannot fail on a length mismatch.
first_day <- as.Date("2015-06-21")
dates <- seq(first_day, by = "day", length.out = nrow(weather_final))

# Sanity check: the generated day-of-month should agree with the scraped
# Day column. A mismatch means rows are missing or duplicated upstream.
day_mismatch <- which(as.integer(format(dates, "%d")) != weather_final$Day)
if (length(day_mismatch) > 0) {
  warning(
    length(day_mismatch),
    " row(s) have a Day value that disagrees with the generated date; ",
    "first mismatch at row ", day_mismatch[1],
    call. = FALSE
  )
}
weather_final$Date <- dates

# Shows NA counts per variable.
# Could be worth excluding WindLow and the visibility columns.
colSums(is.na(weather_final))
library(dplyr)

# Quick base-graphics check of the average temperature over time.
plot(weather_final$Date, weather_final$TempAvg) # looks about right
title("Average Temperature in Cape Town, 6/2015 - 6/2016")

library(ggplot2)

# Same series, coloured by precipitation. The aesthetics belong inside
# aes() only; extra x/y arguments to ggplot() itself are not part of the
# mapping.
ggplot(weather_final, aes(x = Date, y = TempAvg, color = Precip)) +
  geom_point()

# I'd recommend engineering a binary rain variable.
# Now, you should have the weather data you need to link to sales.
# Some ideas:
# - Use dplyr's inner_join() on the weather data and sales by day.
# - Is there a correlation?
# - Do some products sell better on rainy days?
# - Do some products sell better on rainy and cold days?
# Go forth!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment