Skip to content

Instantly share code, notes, and snippets.

@straypacket
Last active December 29, 2015 14:49
Show Gist options
  • Save straypacket/7686801 to your computer and use it in GitHub Desktop.
Journal: Context for MongoDB + geo alerts
// Aggregate users and count location updates
// Groups the locations collection by user id (locatable_id) and counts
// documents per user; sorted so the most active user comes first.
db.locations.aggregate([
{ $group: {_id: "$locatable_id", number: {$sum: 1} }},
{ $sort: { "number": -1}}
])
// Get all updates from a user, ordered by time
// The projection keeps only coordinates and created_at (and drops _id);
// results are sorted oldest-to-newest.
db.locations.find(
{ "locatable_id": ObjectId("51ac7bccee4340adef00000d") },
{'_id': 0, 'coordinates': 1, 'created_at': 1}
).sort({"created_at": 1})
library(rmongodb)
# Connect to the MongoDB instance that holds the location updates.
mg2 <- mongo.create(host="192.168.13.141")
# Simple distinct
# Every user id that has ever reported a location.
users <- mongo.distinct(mg2, "fern_osm.locations", "locatable_id")
# Simple query
# NOTE: the append below is commented out, so `query` is an EMPTY BSON
# document and the count/find below match every document in the collection.
buf <- mongo.bson.buffer.create()
#mongo.bson.buffer.append.object(buf, "locatable_id", "5179075dee434003f8000003")
query <- mongo.bson.from.buffer(buf)
# Make query
count <- mongo.count(mg2, "fern_osm.locations", query)
cur <- mongo.find(mg2, "fern_osm.locations", query)
# Grab results
# Preallocate one slot per matching document; the cursor loop below fills
# these parallel vectors, which are later assembled into a data frame.
locatable_id <- vector("character", count)
updated_at <- vector("character", count)
updated_at_hour <- vector("numeric", count)
updated_at_dayofweek <- vector("numeric", count)
coordinates_lon <- vector("numeric", count)
coordinates_lat <- vector("numeric", count)
value_list <- NULL
# Row index into the preallocated result vectors.
i <- 1
# Walk the cursor, flattening each BSON document into the parallel vectors.
while (mongo.cursor.next(cur)){
value <- mongo.cursor.value(cur)
value_list <- mongo.bson.to.list(value)
# ObjectId of the user that reported this location update.
locatable_id[i] <- as.character.mongo.oid(value_list['locatable_id']$locatable_id)
updated_at[i] <- value_list['updated_at']$updated_at
# Derive hour-of-day and day-of-week (POSIXlt $wday: 0 = Sunday) for the
# time-bucketed analysis further down the script.
updated_at_hour[i] <- as.POSIXlt(value_list['updated_at']$updated_at)$hour
updated_at_dayofweek[i]<- as.POSIXlt(value_list['updated_at']$updated_at)$wday
# Index 1 = longitude, 2 = latitude (GeoJSON-style order) -- TODO confirm
# against the collection's schema.
coordinates_lon[i] <- value_list['coordinates']$coordinates[1]
coordinates_lat[i] <- value_list['coordinates']$coordinates[2]
i <- i + 1
#print(i)
}
# Assemble the flattened vectors into one data frame: one row per location
# update, used by all of the analysis below.
df <- data.frame(locatable_id=locatable_id, updated_at=updated_at, updated_at_hour=updated_at_hour, updated_at_dayofweek=updated_at_dayofweek, coordinates_lon=coordinates_lon, coordinates_lat=coordinates_lat)
#df <- as.data.frame(list(locatable_id=locatable_id, updated_at=updated_at, coordinates_lon=coordinates_lon, coordinates_lat=coordinates_lat))
#summary(df)
mongo.cursor.destroy(cur)
# plyr: arrange(); fpc: dbscan(); RCurl: getURL(); rjson: fromJSON();
# gtools: mixedsort().
library(plyr)
library('fpc')
library("RCurl")
library("rjson")
library("gtools")
# Crawling function
# Recursively walks a parsed Foursquare category node and maps every
# category's shortName to `name` in the global lookup table `treeArray`
# (written via `<<-`; callers must create `treeArray` before the first
# call).
#
# tree: a category node (list with $shortName and a $categories list).
# name: display name of the top-level ancestor category; propagated
#       unchanged to every descendant.
# d:    unused depth argument. The callers in this script invoke
#       crawler() with only two arguments, which previously worked only
#       because `d` was never evaluated; a default makes that explicit
#       and safe.
crawler <- function(tree, name, d = 0) {
  treeArray[tree$shortName] <<- name
  n_children <- length(tree$categories)
  if (n_children > 0) {
    # seq_len() is safe for n_children == 0 (unlike 1:n_children).
    for (i in seq_len(n_children)) {
      crawler(tree$categories[[i]], name)
    }
  }
}
# Time-of-day slots: each column of `tod` is a [start hour, end hour) pair.
# Ten slots: 8-10, 10-12, ..., 22-24, then 0-2 and 6-8.
tod <- matrix(c(8,10, 10,12, 12,14, 14,16, 16,18, 18,20, 20,22, 22,24, 0,2, 6,8), nrow = 2)
#tod <- array(c(8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24),c(2,8))
# Select user
# Candidate users tried during the analysis (point counts in parentheses):
#Unknown #4: 51dcf3edee43408c250001e5(24198 points) NG
#Unknown #3: 51ba886eee4340ece1000039 (23848 points) NOK
#Fernando: 51b57fb8ee4340bec5000016 (17685 points) OK
#Unknown #2: 528b209d9db037c57900043d (5565 points) NG
#Unknown #1: 51f0c181ee4340139700002f (5003 points) NG
dfu <- subset(df, locatable_id == "51b57fb8ee4340bec5000016")
# Weekday rows (Mon-Fri: wday 1..5).
dfuwd <- subset(dfu, updated_at_dayofweek %in% 1:5)
# Weekend rows (Sun = 0, Sat = 6).
dfuwe <- subset(dfu, updated_at_dayofweek %in% c(0, 6))
# Most recent update first (swap in dfuwe for the weekend analysis).
dfuo <- arrange(dfuwd, desc(updated_at))
#dfuo <- arrange(dfuwe,desc(updated_at))
# Rows kept as the "end of a stay" (last update before a long gap),
# accumulated across all time-of-day slots.
final <- {}
# One pass per time slot (columns of `tod`).
for (t in 1:(length(tod)/2)) {
x <- {}
# Create dataset for given time range
sequence <- seq(tod[1,t],tod[2,t]-1)
for ( h in sequence ) {
aux <- subset(dfuo, updated_at_hour == h )
x <- rbind(x, aux)
}
print("==========================================")
print(sequence)
print(nrow(x))
# Now, we only keep the last rows of a string of locations
# i.e. the rows whose next movement is after 1 hour
# NOTE(review): if no rows matched this slot, x stays NULL and
# nrow(x) is NULL, so this condition is zero-length -- confirm.
if (nrow(x) > 0){
# NOTE(review): `2:nrow(x)-1` parses as (2:nrow(x))-1, i.e. 1:(nrow(x)-1);
# if 2:(nrow(x)-1) was intended, the first pair is skipped. When
# nrow(x) == 1 this yields c(1, 0), so r = 0 selects an empty row and the
# `if (tdiff > 600)` test errors on a zero-length condition -- confirm.
for (r in 2:nrow(x)-1) {
a <- x[r,]
b <- x[r+1,]
# Time gap between consecutive rows (rows are sorted newest-first).
# NOTE(review): updated_at is stored as character; as.numeric() on a
# non-numeric timestamp string yields NA -- verify the stored format.
tdiff <- as.numeric(apply(a[c('updated_at')],1,function(r) paste(r))) - as.numeric(apply(b[c('updated_at')],1,function(r) paste(r)))
if (tdiff > 600){
#print(tdiff)
#print(a)
final <- rbind(final,x[r,])
}
}
# Fallback: if no gap exceeded the threshold, keep the last row examined.
# NOTE(review): relies on the loop variable r surviving past the loop.
if (is.null(final)) {
final <- rbind(final,x[r+1,])
}
}
print(final)
}
# Build Foursquare tree
# Fetch the full Foursquare venue-category taxonomy (JSON) and flatten it
# into `treeArray`, a lookup from any category shortName to the shortName
# of its top-level ancestor (populated by crawler() via <<-).
FSCats <- getURL("https://api.foursquare.com/v2/venues/categories?oauth_token=ADB02WREAK4W4R5BDYBVEXHWB14VZM4TQOIWZCYAD1GY22EK&v=20120410")
FSTree <- fromJSON(FSCats, method = "C")
treeArray <- c()
top_cats <- FSTree$response$categories
# seq_along() instead of 1:length() so an empty category list (e.g. on an
# API error) is a no-op rather than an out-of-bounds 1:0 loop.
for (sc in seq_along(top_cats)) {
  crawler(top_cats[[sc]], top_cats[[sc]]$shortName)
}
# Clustering
# For each time-of-day slot: collect that slot's coordinates, cluster them
# with DBSCAN, plot the clusters, then query Foursquare around each cluster
# center to infer the likely context (venue categories) of the stay.
par(mfrow=c(5,2))
# Foursquare search radius, in meters.
radius = 500
timing <- c('Week Day 8-10','Week Day 10-12','Week Day 12-14','Week Day 14-16','Week Day 16-18','Week Day 18-20','Week Day 20-22','Week Day 22-24','Week Day 0-2','Week Day 6-8')
#timing <- c('Weekend 8-10','Weekend 10-12','Weekend 12-14','Weekend 14-16','Weekend 16-18','Weekend 18-20','Weekend 20-22','Weekend 22-24','Weekend 0-2','Weekend 6-8')
for (t in 1:(length(tod)/2)) {
# Create dataset for given time range
sequence <- seq(tod[1,t],tod[2,t]-1)
datax <- {}
datay <- {}
for ( h in sequence ) {
aux <- subset(dfuo, updated_at_hour == h )
#aux <- subset(final, updated_at_hour == h )
if (nrow(aux) > 0) {
# Extract lon/lat row by row as numerics.
for (r in seq(1:nrow(aux))){
a <- aux[r,]
datax <- append(datax, as.numeric(apply(a[c('coordinates_lon')],1,function(r) paste(r))))
datay <- append(datay, as.numeric(apply(a[c('coordinates_lat')],1,function(r) paste(r))))
}
}
}
# Two-column matrix: lon in column 1, lat in column 2 (NULL if no points).
x <- {}
x <- cbind(datax,datay)
if(is.null(x)){
# No data in this slot: draw an empty placeholder panel.
plot(c(0,0), c(0,0), main=paste(timing[t]), xlim = c(139, 140), ylim = c(35, 36), ylab = "Latitude" , xlab = "Longitude")
}
else{
if (nrow(x) > 0){
# Density-based clustering on raw lon/lat degrees.
d <- dbscan(x,eps=0.025, MinPts=20, scale=1, method="raw")
plot(d, x, main=paste(timing[t]), xlim = c(139.6, 140), ylim = c(35.5, 36), ylab = "Latitude" , xlab = "Longitude")
#plot(d, x, main=paste(timing[t],"density",0.025), xlim = c(139.7, 139.85), ylim = c(35.6, 35.8))
#legend("bottomright", inset=.05, title="Clusters", c("1","2","3"), fill=tail(palette(), n=-1), horiz=TRUE)
# Per-category accumulators, keyed by top-level category shortName:
# venue = number of venues seen, venue_count = checkin total,
# venue_distance = summed distance. Seeded with a "" key so the named
# lookups below never index an empty vector.
venue <- {}
venue_count <- {}
venue_distance <- {}
finalvenue <- {}
venue[""] = 0
venue_count[""] = 0
venue_distance[""] = 0
# Context inference
# d$cluster is 0 for noise points; proceed only if a real cluster exists.
if (max(d$cluster)) {
# NOTE(review): `c` and (below) `cat` shadow base::c / base::cat
# inside this loop -- works, but worth renaming.
for (c in 1:max(d$cluster)) {
# Cluster center
clusCenter <- colMeans(x[d$cluster==c, ])
# Fetch context: Foursquare
# clusCenter[2] = lat, clusCenter[1] = lon per the cbind order above.
Uctx <- getURL(paste("https://api.foursquare.com/v2/venues/search?ll=",clusCenter[2],",",clusCenter[1],"&oauth_token=ADB02WREAK4W4R5BDYBVEXHWB14VZM4TQOIWZCYAD1GY22EK&v=20120410&radius=",radius,"&intent=browse", sep=""))
# Convert JSON to R-object
Rctx <- fromJSON(Uctx,method = "C")
# Parse context
ctx <- {}
if (length(Rctx$response$venues)) {
for (v in 1:length(Rctx$response$venues)) {
if(length(Rctx$response$venues[[v]]$categories)) {
# Map the venue's category to its top-level ancestor.
cat <- treeArray[Rctx$response$venues[[v]]$categories[[1]]$shortName]
dist <- Rctx$response$venues[[v]]$location$distance
if (dist < radius) {
ctx <- append( ctx, paste(dist,cat,Rctx$response$venues[[v]]$name) )
if(is.na(venue[cat])){
# First venue of this category: initialize the accumulators.
venue[cat] <- 1
venue_count[cat] <- Rctx$response$venues[[v]]$stats$checkinsCount
venue_distance[cat] <- dist
}
else {
venue[cat] <- venue[cat] + 1
# NOTE(review): this adds venue[cat] (the venue COUNT) to the
# checkins instead of accumulating venue_count[cat] -- likely a
# typo for `venue_count[cat] + ...`; confirm intent.
venue_count[cat] <- venue[cat] + Rctx$response$venues[[v]]$stats$checkinsCount
venue_distance[cat] <- venue_distance[cat] + dist
}
}
}
}
}
}
}
# Report the categories found (length > 1 skips the "" seed-only case).
if (length(venue) > 1) {
for (fv in 1:length(venue)){
if(venue[fv] >= 1){
finalvenue <- append(finalvenue, venue[fv])
}
}
# Infer context
# NOTE(review): c and clusCenter here hold whatever the LAST iteration
# of the cluster loop left behind, not a per-cluster value -- confirm.
print(paste(timing[t], " density: ", 0.025, " cluster: ", c, " of ", length(finalvenue), " with ", sum(d$cluster == c) ," elements @ ", clusCenter[2], "," ,clusCenter[1], sep=""))
#sortedctx <- mixedsort(ctx)
#print(sortedctx)
sortedvenues <- mixedsort(finalvenue)
#sortedvenuecounts <- mixedsort(venue_count)
#for (sv in 1:length(sortedvenues)) {
#mongo.insert(mongo, "test.people", list(name=uid, category=names(sortedvenues[sv]), qty=matrix(sortedvenues)[sv], timeslot=timingWE[i], clusterpoints=sum(d$cluster == c), loc=c(clusCenter[2], clusCenter[1])))
#}
#print(tail(sortedvenues,2))
# Print the top categories.
# NOTE(review): length-n-1 with n in 1..3 picks elements at offsets
# 2..4 from the end, skipping the very last (largest) entry -- if the
# top three were intended, the offset looks off by one; confirm.
for (n in seq(3)){
cat <- names(sortedvenues[length(sortedvenues)-n-1])
print(paste(" - ", cat, ": ", sortedvenues[cat], "venues, ", venue_count[cat], " checkins, at an avg distance of ", venue_distance[cat]/venue_count[cat]))
}
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment