-
-
Save fototo/873f54360aeb2193b293 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#data from http://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time | |
#Ask for these fields | |
#"DAY_OF_WEEK" (IN UI DayOfWeek) | |
#"FL_DATE" (FlightDate) | |
#"CARRIER" (Carrier) | |
#"ORIGIN_CITY_MARKET_ID" (OriginCityMarketID) | |
#"ORIGIN" (Origin) | |
#"CRS_DEP_TIME" (CRSDepTime) | |
#"DEP_DELAY" (DepDelay) | |
#"ARR_DELAY" (ArrDelay) | |
#save files with names in format 01_2013.csv.gz, 02_2013.csv.gz, etc. | |
#save them in a subdirectory called "database" | |
library(plyr) | |
library(tidyr) | |
library(lubridate) | |
library(ggplot2) | |
library(dplyr) | |
#On first run, uncomment and do this. Afterwards, just load the .Rdata file | |
# paths=dir("database",pattern="_2013.csv",full.names=TRUE) | |
# names(paths)=basename(paths) | |
# #takes about 4 mins | |
# df=ldply(paths,read.csv) | |
# df$X=df$.id=NULL | |
# names(df)=c("day_of_week","date","carrier","origin_market_id","airport", | |
# "departure_hour","delay","arr_delay") | |
# | |
# save(df,file="2013flights.Rdata") | |
#takes only a few secs | |
load(file="2013flights.Rdata") | |
str(df) | |
df = df %>% | |
mutate( | |
day_of_week=factor(day_of_week,levels=c(1:7,9), | |
labels= | |
c("Monday", | |
"Tuesday", | |
"Wednesday", | |
"Thursday", | |
"Friday", | |
"Saturday", | |
"Sunday", | |
"Unknown" | |
)), | |
month=substr(date,6,7), | |
datenum=substr(date,9,10), | |
departure_hour = round(departure_hour/100,0), | |
delay=ifelse(delay<0,0,delay), | |
arr_delay=ifelse(arr_delay<0,0,arr_delay)) %>% | |
filter(departure_hour > 5 & departure_hour< 24) | |
###Arrival and departure delays as fun of departure time | |
plot_data = df %>% | |
gather(delay_type,newdelay,delay:arr_delay) %>% | |
mutate(delay_type = ifelse(delay_type=="delay","Departure Delay","Arrival Delay")) %>% | |
group_by(departure_hour,delay_type) %>% | |
dplyr::summarise(mu=mean(newdelay,na.rm=TRUE), | |
se=sqrt(var(newdelay,na.rm=TRUE)/length(na.omit(newdelay))), | |
obs=length(na.omit(newdelay))) | |
#MS: W00t! | |
p=ggplot(plot_data,aes(x=departure_hour,y=mu,min=mu-se,max=mu+se,group=delay_type,color=delay_type)) + | |
geom_line() + | |
geom_point() + | |
geom_errorbar(width=.33) + | |
scale_x_continuous(breaks=seq(6,23)) + | |
labs(x="Hour of Day",y="Average Delay",title="Flight Delays by Departure Time") + | |
theme(legend.position="bottom") + | |
scale_color_discrete(name="Delay Type") | |
p | |
ggsave(plot=p,file="Flight_Delays_By_Hour_DelayType.pdf",width=6,height=4) | |
ggsave(plot=p,file="Flight_Delays_By_Hour_DelayType.png",width=6,height=4) | |
####For every day of the year | |
plot_data = df %>% | |
group_by(month, datenum) %>% | |
dplyr::summarise(mu=median(delay,na.rm=TRUE), | |
se=sqrt(var(delay,na.rm=TRUE)/length(na.omit(delay))), | |
obs=length(na.omit(delay))) | |
p=ggplot(plot_data,aes(x=datenum,y=mu,min=mu-se,max=mu+se,group=month)) + | |
geom_line() + | |
geom_point() + | |
scale_y_continuous(breaks=c(0,10)) + | |
coord_cartesian(ylim=c(-4,16)) + | |
labs(x="Day of month",y="Median Departure Delay",title="Median Flight Delays by Departure Date") + | |
theme(legend.position="bottom") + | |
facet_grid(month ~.) + | |
theme_bw() | |
p | |
ggsave(plot=p,file="Flight_Delays_By_Departure_Date.pdf",width=8,height=6) | |
ggsave(plot=p,file="Flight_Delays_By_Departure_Date.png",width=8,height=6) | |
###JUST JFK AND LAX AND ORD AND DFW | |
plot_data = df %>% | |
filter(airport %in% c("JFK","LAX","DFW","ORD")) %>% | |
group_by(departure_hour,airport) %>% | |
dplyr::summarise(mu=mean(delay,na.rm=TRUE), | |
se=sqrt(var(delay,na.rm=TRUE)/length(na.omit(delay))), | |
obs=length(na.omit(delay))) | |
p=ggplot(plot_data,aes(x=departure_hour,y=mu,min=mu-se,max=mu+se,group=airport,color=airport)) + | |
geom_line() + | |
geom_point() + | |
geom_errorbar(width=.33) + | |
scale_x_continuous(breaks=seq(5,23)) + | |
labs(x="Hour of Day",y="Average Departure Delay",title="Flight Delays by Departure Time and Airport") + | |
theme(legend.position="bottom") + | |
scale_color_discrete(name="Delay Type") | |
p | |
ggsave(plot=p,file="Flight_Delays_By_Hour_Airport.pdf",width=6,height=4) | |
ggsave(plot=p,file="Flight_Delays_By_Hour_Airport.png",width=6,height=4) | |
###Just the 95% and 75% quantiles | |
plot_data = df %>% | |
group_by(departure_hour) %>% | |
dplyr::summarise(Quantile_95=quantile(delay,.95,na.rm=TRUE), | |
Quantile_75=quantile(delay,.75,na.rm=TRUE), | |
obs=length(na.omit(delay))) | |
plot_data2 = plot_data %>% | |
gather(variable, value, Quantile_75:Quantile_95) %>% | |
mutate(variable=factor(variable,levels=c("Quantile_95","Quantile_75"))) | |
p=ggplot(plot_data2,aes(x=departure_hour,y=value,group=variable,color=variable)) + | |
geom_line() + | |
scale_x_continuous(breaks=seq(5,23)) + | |
labs(x="Hour of Day",y="Departure Delay",title="95th and 75th Percentiles of Departure Delays") + | |
scale_color_discrete(name="Quantile") + | |
theme(legend.position="bottom") | |
p | |
ggsave(plot=p,file="Flight_Delays_By_Hour_95th.pdf",width=6,height=4) | |
ggsave(plot=p,file="Flight_Delays_By_Hour_95th.png",width=6,height=4) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment