Last active
September 30, 2015 21:47
-
-
Save dggoldst/a31f6a64426a0ccb5ec3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Data source:
# http://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time
# Request these fields (the name shown in the download UI is in parentheses):
#   "DAY_OF_WEEK" (DayOfWeek)
#   "FL_DATE" (FlightDate)
#   "CARRIER" (Carrier)
#   "ORIGIN_CITY_MARKET_ID" (OriginCityMarketID)
#   "ORIGIN" (Origin)
#   "CRS_DEP_TIME" (CRSDepTime)
#   "DEP_DELAY" (DepDelay)
#   "ARR_DELAY" (ArrDelay)
# Save the monthly files with names in the format 01_2013.csv.gz,
# 02_2013.csv.gz, etc., in a subdirectory called "database".
library(plyr) | |
library(tidyr) | |
library(lubridate) | |
library(ggplot2) | |
library(dplyr) | |
# Work from the project directory so the relative paths used below resolve
# (the "database" folder, the cached .Rdata file, and the saved figures).
setwd("C:/Dropbox/Projects/20141105_OnTimeDeparture/")

# First run only: read every 2013 CSV found in "database", stack them into
# one data frame, rename the columns, and cache the result on disk.
# Reading the CSVs takes about 4 minutes.
# paths=dir("database",pattern="_2013.csv",full.names=TRUE)
# names(paths)=basename(paths)
# df=ldply(paths,read.csv)
# df$X=df$.id=NULL
# names(df)=c("day_of_week","date","carrier","origin_market_id","airport",
#             "departure_hour","delay","arr_delay")
#
# save(df,file="2013flights.Rdata")

# Every later run: restore the cached `df` (only a few seconds) and print
# its structure as a quick sanity check.
load("2013flights.Rdata")
str(df)
# Clean the raw columns and keep only flights whose scheduled departure hour
# is between 6 and 23 inclusive.
#   day_of_week      numeric code (1-7, 9) -> labelled factor
#   month, datenum   characters 6-7 and 9-10 of `date`
#   departure_hour   hour part of the scheduled departure time
#   delay, arr_delay negative values are raised to 0; NA stays NA
df <- df %>%
  mutate(
    day_of_week = factor(
      day_of_week,
      levels = c(1:7, 9),
      labels = c(
        "Monday",
        "Tuesday",
        "Wednesday",
        "Thursday",
        "Friday",
        "Saturday",
        "Sunday",
        "Unknown"
      )
    ),
    month = substr(date, 6, 7),
    datenum = substr(date, 9, 10),
    # The raw value is CRS_DEP_TIME, a clock time written as hhmm, so integer
    # division by 100 yields the hour. The previous round(x / 100) moved
    # minutes 50-59 into the next hour (1555 -> 16) and turned 2350-2359 into
    # hour 24, which the filter below then discarded.
    departure_hour = departure_hour %/% 100,
    delay = pmax(delay, 0),
    arr_delay = pmax(arr_delay, 0)
  ) %>%
  filter(departure_hour > 5 & departure_hour < 24)
### Arrival and departure delays as a function of departure time
# Stack the two delay columns into long form, then compute the mean, the
# standard error of the mean, and the non-missing count for every
# departure-hour / delay-type pair.
plot_data <- df %>%
  gather(delay_type, newdelay, delay:arr_delay) %>%
  mutate(
    delay_type = ifelse(
      delay_type == "delay",
      "Departure Delay",
      "Arrival Delay"
    )
  ) %>%
  group_by(departure_hour, delay_type) %>%
  dplyr::summarise(
    mu = mean(newdelay, na.rm = TRUE),
    se = sqrt(var(newdelay, na.rm = TRUE) / sum(!is.na(newdelay))),
    obs = sum(!is.na(newdelay))
  )
#MS: W00t!

# One line per delay type; error bars span mean +/- one standard error.
p <- ggplot(
  plot_data,
  aes(
    x = departure_hour,
    y = mu,
    ymin = mu - se,
    ymax = mu + se,
    group = delay_type,
    color = delay_type
  )
) +
  geom_line() +
  geom_point() +
  geom_errorbar(width = .33) +
  scale_x_continuous(breaks = 6:23) +
  labs(
    x = "Hour of Day",
    y = "Average Delay (Minutes)",
    title = "Flight Delays by Departure Time"
  ) +
  theme(legend.position = "bottom") +
  scale_color_discrete(name = "Delay Type")
p

# Write the figure as both PDF and PNG.
ggsave(filename = "Flight_Delays_By_Hour_DelayType.pdf", plot = p,
       width = 6, height = 4)
ggsave(filename = "Flight_Delays_By_Hour_DelayType.png", plot = p,
       width = 6, height = 4)
#### Median departure delay for every day of the year
# One row per month / day-of-month pair: the median departure delay, the
# standard error of the mean delay, and the non-missing count.
plot_data <- df %>%
  group_by(month, datenum) %>%
  dplyr::summarise(
    mu = median(delay, na.rm = TRUE),
    se = sqrt(var(delay, na.rm = TRUE) / sum(!is.na(delay))),
    obs = sum(!is.na(delay))
  )

# One panel per month with day of month on the x axis. The visible y range
# is fixed to -4..16 and only 0 and 10 get tick marks.
# NOTE(review): theme_bw() is added last, after theme(legend.position = ...);
# the order is kept exactly as in the original script.
p <- ggplot(
  plot_data,
  aes(x = datenum, y = mu, ymin = mu - se, ymax = mu + se, group = month)
) +
  geom_line() +
  geom_point() +
  scale_y_continuous(breaks = c(0, 10)) +
  coord_cartesian(ylim = c(-4, 16)) +
  labs(
    x = "Day of month",
    y = "Median Departure Delay (Minutes)",
    title = "Median Flight Delays by Departure Date"
  ) +
  theme(legend.position = "bottom") +
  facet_grid(month ~ .) +
  theme_bw()
p

# Write the figure as both PDF and PNG.
ggsave(filename = "Flight_Delays_By_Departure_Date.pdf", plot = p,
       width = 8, height = 6)
ggsave(filename = "Flight_Delays_By_Departure_Date.png", plot = p,
       width = 8, height = 6)
### Just the 10 busiest airports
# Source for the list and its order:
# en.wikipedia.org/wiki/List_of_the_busiest_airports_in_the_United_States
# (section: Busiest_US_airports_by_total_passenger_boardings)
busiest_airports <- c(
  "ATL", "LAX", "ORD", "DFW", "DEN",
  "JFK", "SFO", "CLT", "LAS", "PHX"
)

# Mean departure delay, its standard error, and the non-missing count for
# every departure-hour / airport pair among those ten airports.
plot_data <- df %>%
  filter(airport %in% busiest_airports) %>%
  group_by(departure_hour, airport) %>%
  dplyr::summarise(
    mu = mean(delay, na.rm = TRUE),
    se = sqrt(var(delay, na.rm = TRUE) / sum(!is.na(delay))),
    obs = sum(!is.na(delay))
  ) %>%
  mutate(
    # Means below 0.001 are replaced with NA.
    mu = ifelse(mu < .001, NA, mu),
    # Factor levels follow the order of the list above, so the integer codes
    # 1-10 match each airport's position in it.
    airport = factor(airport, levels = busiest_airports)
  )
# Hourly mean departure delay for the airports whose factor code is 1-5
# (the first five levels of `airport`), one coloured line per airport.
p <- plot_data %>%
  filter(as.integer(airport) <= 5) %>%
  ggplot(
    aes(
      x = departure_hour,
      y = mu,
      ymin = mu - se,
      ymax = mu + se,
      group = airport,
      color = airport,
      shape = airport
    )
  ) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = 5:23) +
  labs(
    x = "Hour of Day",
    y = "Average Departure Delay (Minutes)",
    title = "Top Five Most Popular Airports"
  ) +
  theme(legend.position = "bottom") +
  scale_color_discrete(name = "Airport") +
  scale_shape_discrete(name = "Airport")
p

ggsave(filename = "Flight_Delays_By_Hour_Airport_Top5.png", plot = p,
       width = 6, height = 4)
# Hourly mean departure delay for the airports whose factor code is above 5
# (levels six through ten of `airport`), one coloured line per airport.
p <- plot_data %>%
  filter(as.integer(airport) > 5) %>%
  ggplot(
    aes(
      x = departure_hour,
      y = mu,
      ymin = mu - se,
      ymax = mu + se,
      group = airport,
      color = airport,
      shape = airport
    )
  ) +
  geom_line() +
  geom_point() +
  scale_x_continuous(breaks = 5:23) +
  labs(
    x = "Hour of Day",
    y = "Average Departure Delay (Minutes)",
    title = "Airports Six through Ten"
  ) +
  theme(legend.position = "bottom") +
  scale_color_discrete(name = "Airport") +
  scale_shape_discrete(name = "Airport")
p

ggsave(filename = "Flight_Delays_By_Hour_Airport_6to10.png", plot = p,
       width = 6, height = 4)
### Just the 95% and 75% quantiles
# 95th and 75th percentile of departure delay, plus the non-missing count,
# for each departure hour.
plot_data <- df %>%
  group_by(departure_hour) %>%
  dplyr::summarise(
    Quantile_95 = quantile(delay, .95, na.rm = TRUE),
    Quantile_75 = quantile(delay, .75, na.rm = TRUE),
    obs = sum(!is.na(delay))
  )

# Long form for plotting: one row per hour / quantile. The explicit factor
# levels put Quantile_95 ahead of Quantile_75.
plot_data2 <- plot_data %>%
  gather(variable, value, Quantile_75:Quantile_95) %>%
  mutate(
    variable = factor(variable, levels = c("Quantile_95", "Quantile_75"))
  )

p <- ggplot(
  plot_data2,
  aes(x = departure_hour, y = value, group = variable, color = variable)
) +
  geom_line() +
  scale_x_continuous(breaks = 5:23) +
  labs(
    x = "Hour of Day",
    y = "Departure Delay (Minutes)",
    title = "95th and 75th Percentiles of Departure Delays"
  ) +
  scale_color_discrete(name = "Quantile") +
  theme(legend.position = "bottom")
p

# Write the figure as both PDF and PNG.
ggsave(filename = "Flight_Delays_By_Hour_95th.pdf", plot = p,
       width = 6, height = 4)
ggsave(filename = "Flight_Delays_By_Hour_95th.png", plot = p,
       width = 6, height = 4)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment