Created
November 22, 2013 05:47
-
-
Save bsmithgall/7595417 to your computer and use it in GitHub Desktop.
Original version of top_stations.py from github.com/bsmithgall/citibike
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library('RCurl') | |
library('RJSONIO') | |
library('igraph') | |
stations.url <- getURL('http://citibikenyc.com/stations/json') | |
dist.url <- getURL('appservices.citibikenyc.com/data2/stations.php') | |
# prep stations for kmeans clustering | |
stations.prep <- function(url) { | |
stations.json2 <- fromJSON(url, method='C') | |
stations.list2 <- as.data.frame(matrix(unlist( | |
lapply(stations.json2$stationBeanList, function(x){ | |
cbind(x$stationName, x$id, x$availableDocks, x$totalDocks, x$latitude, x$longitude) | |
})), nrow=length(stations.json2$stationBeanList), byrow=T) | |
) | |
# fix up some problems with factors | |
stations2 <- as.data.frame(cbind( | |
as.numeric(as.character(stations.list2$V2)), | |
as.numeric(as.character(stations.list2$V3)), | |
as.numeric(as.character(stations.list2$V4)), | |
as.numeric(as.character(stations.list2$V5)), | |
as.numeric(as.character(stations.list2$V6)) | |
)) | |
stations2$name<-as.character(stations.list2$V1) | |
names(stations2) <- c('id','available','total','lat','long', 'name') | |
stations.tocluster2 <- stations2[( | |
stations2$total-stations2$available)/stations2$total < .2,] | |
return(stations.tocluster2) | |
} | |
# cluster the stations | |
stations.kmeans <- function(stations.df) { | |
clusterables <- data.frame(stations.df$lat,stations.df$long) | |
row.names(clusterables) <- stations.df$id | |
clustered <- kmeans(clusterables, 4) | |
stations.cluster2 <- data.frame(stations.df, clustered$cluster) | |
return(stations.cluster2) | |
} | |
# prep functions for igraph analysis | |
degrees.to.radians<-function(degrees) { | |
return(degrees*pi/180) | |
} | |
haversine <- function(long1, lat1, long2, lat2) { | |
R <- 6371 # Earth mean radius [km] | |
delta.long <- (long2 - long1) | |
delta.lat <- (lat2 - lat1) | |
a <- sin(delta.lat/2)^2 + cos(lat1) * cos(lat2) * sin(delta.long/2)^2 | |
c <- 2 * asin(min(1,sqrt(a))) | |
d = R * c | |
return(d) # Distance in km | |
} | |
haversine.distance <- function(long1, lat1, long2, lat2) { | |
long1 <- degrees.to.radians(long1) | |
lat1 <- degrees.to.radians(lat1) | |
long2 <- degrees.to.radians(long2) | |
lat2 <- degrees.to.radians(lat2) | |
return(haversine(long1, lat1, long2, lat2)) | |
} | |
# function to generate graph data, a df of columns node1, node2, distance | |
graph.explode <- function(clustered.df) { | |
y <- data.frame( | |
t(apply( | |
combn(paste(clustered.df$id, clustered.df$lat, clustered.df$lon, sep=","), 2), 2, | |
function(i){ | |
i.names <- unlist(strsplit(as.character(i), split='\\,')) | |
latslongs <- c(as.numeric(i.names[2]),as.numeric(i.names[3]), | |
as.numeric(i.names[5]),as.numeric(i.names[6])) | |
return(c(i.names[1],i.names[4], | |
haversine.distance(latslongs[1],latslongs[2], | |
latslongs[3],latslongs[4]))) | |
}))) | |
# cast the weight to numeric | |
y[,3] <- as.numeric(as.character(y[,3])) | |
# to get a better graph, we are going to dump connections with longer | |
# distances than half a mile (~.8 km) | |
y <- y[y[,3] < .8,] | |
return(y) | |
} | |
# now that we are ready, dump everything into an igraph, | |
# extract the degree centrality, and return it out as a data.frame | |
get.graph.data <- function(single.cluster) { | |
cluster.graph2 <- graph.explode(single.cluster) | |
g2 <- graph.data.frame(cluster.graph2) | |
V(g2)$degcent <- centralization.degree(g2)$res | |
q2 <- as.data.frame(as.numeric(as.matrix(V(g2)$name))) | |
q2$v2<-as.matrix(V(g2)$degcent) | |
names(q2) <- c('id','degcent') | |
q2<-q2[order(q2$id, decreasing=TRUE),] | |
output <- merge(q2, single.cluster, by="id") | |
return(output) | |
} | |
# now we have to parse out closest nodes (by dist) to each highly | |
# central node -- first we have to get the data about distances | |
get.dists <- function(url) { | |
dist.json <- fromJSON(url, method='C') | |
dist.list <- as.data.frame(matrix(unlist( | |
lapply(dist.json$results, function(i) { | |
unlist(c(i$id,lapply(i$nearbyStations, function(j){unlist(cbind(j[1]))}))) | |
})), nrow=length(dist.json$results), byrow=TRUE)) | |
names(dist.list) <- c('id','close.one','close.two', | |
'close.three','close.four','close.five') | |
return(dist.list) | |
} | |
# now we have the graph data and the distances, so we have to | |
# combine these two together to return a list of recommended | |
# stations to repopulate | |
make.recs <- function(graph.data, dists) { | |
i <- 1 | |
close <- data.frame(check = numeric(0)) | |
results <- data.frame(id = numeric(0), name = character(0), | |
available = numeric(0), total = numeric(0), | |
stringsAsFactors=FALSE) | |
comb <- merge(graph.data, dists, by="id") | |
comb <- comb[order(comb$degcent, decreasing = T),] | |
# oh no a loop! kill it with fire! | |
# but seriously there's only like 300 total rows so i'm not | |
# going to sweat this one too much. maybe it can be refactored | |
# later | |
while(nrow(results) < 4) { | |
j <- comb[i,] | |
close[nrow(close) + 1,] <- c(j$close.one) | |
close[nrow(close) + 1,] <- c(j$close.two) | |
close[nrow(close) + 1,] <- c(j$close.three) | |
close[nrow(close) + 1,] <- c(j$close.four) | |
close[nrow(close) + 1,] <- c(j$close.five) | |
if(j$id %in% close$check) { NA } | |
else { results[nrow(results) + 1,] <- c(j$id, j$name, j$available, j$total) } | |
i <- i+1 | |
} | |
return(results) | |
} | |
main <- function() { | |
stations.tocluster <- stations.prep(stations.url) | |
stations.cluster <- stations.kmeans(stations.tocluster) | |
dists <- get.dists(dist.url) | |
output <- data.frame(id = numeric(0), name = character(0), | |
available = numeric(0), total = numeric(0), | |
stringsAsFactors=FALSE) | |
for(i in 1:4) { | |
j <- stations.cluster[stations.cluster$clustered.cluster==i,] | |
recs <- make.recs(get.graph.data(j), dists) | |
output <- rbind(output,recs) | |
} | |
return(output) | |
} | |
c <- main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment