Created
August 6, 2014 11:49
-
-
Save milesgrimshaw/67d26ffa50a6f9c4f5ef to your computer and use it in GitHub Desktop.
R script to analyze personal Citibike data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(ggplot2)
library(lubridate)

## Work from the project folder so the relative file names used below resolve
setwd("~/Dropbox (Personal)/Personal/Github/Citibike")

## Load the exported Citibike trip log; text columns stay character
## vectors rather than being converted to factors
r <- read.csv(
  file = "trips.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

## Quick look at the first few rows
head(r)
## Express both durations in minutes (the raw columns are divided by 60,
## so they are presumably recorded in seconds -- confirm against the export)
r$actual_mins <- r$actual_duration / 60
r$estimated_mins <- r$estimated_duration / 60

## Keep only rides shorter than an hour: longer ones are treated as
## recording errors. Rows with a missing duration are dropped as well.
r <- subset(r, actual_mins < 60)

## Persist the cleaned trips (without row names) for reuse
write.csv(r, file = "miles_trips_edits.csv", row.names = FALSE)
## Relative gap between estimated and actual duration, as a fraction of
## the estimate (positive when the ride was faster than estimated)
r$time_diff_perc <- (r$estimated_mins - r$actual_mins) / r$estimated_mins

## Convert the estimated distance from meters to miles.
## One international mile is exactly 1609.344 m; dividing by a rounded
## 1600 would overstate every distance (and speed) by roughly 0.6%.
r$miles <- r$estimated_distance / 1609.344

## Average speed in miles per hour (miles per minute scaled to an hour)
r$mph <- (r$miles / r$actual_mins) * 60

## Correct for errors in a single pass: keep only rides under 100 minutes
## with a speed strictly between 0 and 25 mph. which() also drops rows
## where any of the conditions is NA.
r <- r[which(r$actual_mins < 100 & r$mph > 0 & r$mph < 25), ]
## Summary statistics ----

## Distance: total, distribution, and distribution of non-zero trips only
sum(r$miles)
summary(r$miles)
summary(r$miles[!is.na(r$miles) & r$miles > 0])

## Time: total actual vs. total estimated minutes, then the distribution
sum(r$actual_mins)
sum(r$estimated_mins)
summary(r$actual_mins)

## Estimated-vs-actual gap as a percentage; non-finite values (NA, NaN,
## +/-Inf, e.g. from a zero estimate) are left out
summary(r$time_diff_perc[is.finite(r$time_diff_perc)]) * 100

## Speed: all trips, then only trips slower than 20 mph
summary(r$mph)
summary(r$mph[!is.na(r$mph) & r$mph < 20])
## Parse the start/end stamps (month/day/year, 12-hour clock with AM/PM)
## into POSIXct date-times.
## NOTE(review): `origin` looks redundant when the input columns are text
## and a `format` is supplied -- kept as-is, confirm before removing.
r$start_date <- as.POSIXct(
  r$start_time,
  format = "%m/%d/%Y %I:%M:%S %p",
  origin = "1970-01-01"
)
r$end_date <- as.POSIXct(
  r$end_time,
  format = "%m/%d/%Y %I:%M:%S %p",
  origin = "1970-01-01"
)

## Clock time of each trip's end and start, as minutes after midnight
r$end_date_mins <- 60 * hour(r$end_date) + minute(r$end_date)
r$start_date_mins <- 60 * hour(r$start_date) + minute(r$start_date)
## Trips that started on wday() codes 2-6 (Monday-Friday under lubridate's
## default Sunday = 1 numbering); rows with an unparsed start are excluded
weekday <- r[wday(r$start_date) %in% 2:6, ]

## Morning commute: weekday trips starting in hours 7-10 that end at the
## Lafayette St & Jersey St station (presumably the workplace dock)
morn_commute <- weekday[
  hour(weekday$start_date) %in% 7:10 &
    weekday$end_station %in% "Lafayette St & Jersey St",
]

## All weekday trips starting from 18:00 onwards
weekday_evening <- weekday[hour(weekday$start_date) %in% 18:23, ]

## Evening trips leaving the Lafayette St & Jersey St station
eve_commute <- weekday_evening[
  weekday_evening$start_station %in% "Lafayette St & Jersey St",
]

## Evening trips arriving at Greenwich Ave & Charles St
## (presumably the home dock, going by the variable name)
eve_commute_home <- weekday_evening[
  weekday_evening$end_station %in% "Greenwich Ave & Charles St",
]
## Speed summaries: morning commute, all weekday-evening trips, and
## evening trips leaving the work-side station
summary(morn_commute$mph)
summary(weekday_evening$mph)
summary(eve_commute$mph)

## Morning commute arrival time, in hours after midnight
summary(morn_commute$end_date_mins) / 60
## Share of morning commutes arriving after 9:00 ...
sum(morn_commute$end_date_mins / 60 > 9, na.rm = TRUE) /
  length(morn_commute$end_date_mins)
## ... and share arriving at or before 8:00
sum(morn_commute$end_date_mins / 60 <= 8, na.rm = TRUE) /
  length(morn_commute$end_date_mins)

## Evening commute departure time, in hours after midnight
summary(eve_commute$start_date_mins) / 60
## Share of evening departures after 20:00
sum(eve_commute$start_date_mins / 60 > 20, na.rm = TRUE) /
  length(eve_commute$start_date_mins)

## Evening arrival time at the home-side station, in hours after midnight
summary(eve_commute_home$end_date_mins) / 60
## Share of those arrivals after 22:00
sum(eve_commute_home$end_date_mins / 60 > 22, na.rm = TRUE) /
  length(eve_commute_home$end_date_mins)
## Label every trip for the summaries and graphics below: "Workday" when it
## started on wday() codes 2-6 (Monday-Friday under lubridate's default
## Sunday = 1 numbering), otherwise "Weekend". Trips whose start time could
## not be parsed fall into "Weekend", matching the default label.
r$weekday <- ifelse(wday(r$start_date) %in% 2:6, "Workday", "Weekend")

## Workday vs. weekend speed.
## The column holds the text labels above, so the groups must be selected
## by label; comparing against 1/0 would match no rows at all.
summary(r$mph[r$weekday == "Workday"])
summary(r$mph[r$weekday == "Weekend"])

## Workday vs. weekend distance
summary(r$miles[r$weekday == "Workday"])
summary(r$miles[r$weekday == "Weekend"])
## Scatter plot of trip speed against start hour, coloured by the
## weekday/weekend label, written to a landscape letter-size PDF
r$weekday <- factor(r$weekday)
pdf(file = "Plot_Time_Speed_Weekday.pdf", width = 11, height = 8.5)
## print() is explicit so the plot is drawn into the PDF even when the
## script is run through source(), where top-level values are not
## auto-printed. Columns are referenced by bare name inside aes() so they
## are looked up in `r` itself.
print(
  ggplot(r, aes(x = hour(start_date), y = mph)) +
    geom_point(aes(colour = weekday), position = "jitter") +
    xlab("Hour of the Day") +
    ylab("MPH") +
    ggtitle("Trip Speed by Time of Day")
)
dev.off()
## Histogram of trip distances in 0.1-mile bins
ggplot(data = r, mapping = aes(x = miles)) +
  geom_histogram(binwidth = 0.1, fill = "#0000CC") +
  labs(
    x = "Miles",
    y = "Number of Trips",
    title = "Distribution of Trip Lengths"
  )

## Histogram of actual trip durations in 1-minute bins
ggplot(data = r, mapping = aes(x = actual_mins)) +
  geom_histogram(binwidth = 1)
## Histogram of trip speeds in 1-mph bins, written to a landscape
## letter-size PDF
pdf(file = "Distribution_Trip_Speeds.pdf", width = 11, height = 8.5)
## print() is explicit so the plot is drawn into the PDF even when the
## script is run through source(), where top-level values are not
## auto-printed.
print(
  ggplot(r, aes(x = mph)) +
    geom_histogram(binwidth = 1, fill = "#0000CC") +
    xlab("MPH") +
    ylab("Number of Trips") +
    ggtitle("Distribution of Trip Speeds")
)
dev.off()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment