straypacket · December 29, 2015 14:49
diff --git a/mongo.js b/mongo.js
 // Count location updates per user and list the users with the most updates first
 var updatesPerUser = [
   { $group: { _id: "$locatable_id", number: { $sum: 1 } } },
   { $sort: { "number": -1 } }
 ];
 db.locations.aggregate(updatesPerUser)

 // Fetch the coordinates and creation time of one user's updates, ascending by created_at
 var singleUser = { "locatable_id": ObjectId("51ac7bccee4340adef00000d") };
 var returnedFields = { '_id': 0, 'coordinates': 1, 'created_at': 1 };
 db.locations.find(singleUser, returnedFields).sort({ "created_at": 1 })
diff --git a/mongo_raw_data.r b/mongo_raw_data.r
 library(rmongodb)
 mg2 <- mongo.create(host = "192.168.13.141")

 # Distinct values of locatable_id in the locations collection
 users <- mongo.distinct(mg2, "fern_osm.locations", "locatable_id")

 # Query document built from an empty buffer (the filter below is disabled)
 buf <- mongo.bson.buffer.create()
 #mongo.bson.buffer.append.object(buf, "locatable_id", "5179075dee434003f8000003")
 query <- mongo.bson.from.buffer(buf)

 # Count the matches first so the result columns can be preallocated,
 # then open a cursor over the same query
 count <- mongo.count(mg2, "fern_osm.locations", query)
 cur <- mongo.find(mg2, "fern_osm.locations", query)

 # One preallocated vector per output column
 locatable_id <- character(count)
 updated_at <- character(count)
 updated_at_hour <- numeric(count)
 updated_at_dayofweek <- numeric(count)
 coordinates_lon <- numeric(count)
 coordinates_lat <- numeric(count)
 value_list <- NULL
 i <- 1
 while (mongo.cursor.next(cur)) {
   value <- mongo.cursor.value(cur)
   value_list <- mongo.bson.to.list(value)

   # Pull the three fields of interest out of the converted record
   stamp <- value_list[["updated_at"]]
   stamp_lt <- as.POSIXlt(stamp)
   position <- value_list[["coordinates"]]

   locatable_id[i] <- as.character.mongo.oid(value_list[["locatable_id"]])
   updated_at[i] <- stamp
   updated_at_hour[i] <- stamp_lt$hour
   updated_at_dayofweek[i] <- stamp_lt$wday

   # First element is stored as longitude, second as latitude
   coordinates_lon[i] <- position[1]
   coordinates_lat[i] <- position[2]
   i <- i + 1
   #print(i)
 }

 # Assemble the collected columns into one data frame
 df <- data.frame(
   locatable_id = locatable_id,
   updated_at = updated_at,
   updated_at_hour = updated_at_hour,
   updated_at_dayofweek = updated_at_dayofweek,
   coordinates_lon = coordinates_lon,
   coordinates_lat = coordinates_lat
 )
 #df <- as.data.frame(list(locatable_id=locatable_id, updated_at=updated_at, coordinates_lon=coordinates_lon, coordinates_lat=coordinates_lat))
 #summary(df)

 mongo.cursor.destroy(cur)
diff --git a/user_analysis.r b/user_analysis.r
 library(plyr)
 library('fpc')
 library("RCurl")
 library("rjson")
 library("gtools")

 # Crawling function
 #
 # Walks a category tree depth-first and records `name` for the node itself
 # and for every descendant, keyed by each node's shortName.
 #
 # tree: a category node (list) with a `shortName` element and, optionally, a
 #       `categories` list holding child nodes of the same shape.
 # name: the label stored for this node and all nodes below it.
 # d:    never read; kept so the signature stays unchanged.
 #
 # Side effect: writes into the `treeArray` variable of an enclosing
 # environment through `<<-`, so `treeArray` must exist before the first call.
 # Returns NULL invisibly.
 crawler <- function(tree, name, d) {
   treeArray[tree$shortName] <<- name
   # A missing or empty `categories` element simply yields no iterations
   for (child in tree$categories) {
     crawler(child, name)
   }
   invisible(NULL)
 }

 # Time-of-day slots: column k holds c(first_hour, end_hour) of slot k
 tod <- matrix(
   c(8, 10, 10, 12, 12, 14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 0, 2, 6, 8),
   nrow = 2, ncol = 10
 )
 #tod <- array(c(8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24),c(2,8))
 #tod <- array(c(8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24),c(2,8))

 # Select user
 #Unknown #4: 51dcf3edee43408c250001e5(24198 points) NG
 #Unknown #3: 51ba886eee4340ece1000039 (23848 points) NOK
 #Fernando: 51b57fb8ee4340bec5000016 (17685 points) OK
 #Unknown #2: 528b209d9db037c57900043d (5565 points) NG
 #Unknown #1: 51f0c181ee4340139700002f (5003 points) NG
 dfu <- subset(df, locatable_id == "51b57fb8ee4340bec5000016")
 # Rows whose day-of-week lies strictly between 0 and 6 (weekdays)
 dfuwd <- subset(dfu, 0 < updated_at_dayofweek & updated_at_dayofweek < 6)
 # Rows whose day-of-week is 0 or 6 (weekends)
 dfuwe <- subset(dfu, updated_at_dayofweek %in% c(0, 6))
 # Weekday rows in descending updated_at order; swap in the line below for weekends
 dfuo <- arrange(dfuwd, desc(updated_at))
 #dfuo <- arrange(dfuwe,desc(updated_at))
 final <- NULL

 # For every time-of-day slot, collect the selected rows of `dfuo` and append
 # to `final` each row whose updated_at exceeds the following row's by more
 # than 600. Progress and the accumulated `final` are printed per slot.
 for (t in seq_len(ncol(tod))) {
   # Hours covered by this slot, e.g. c(8, 10) -> 8, 9
   sequence <- seq(tod[1, t], tod[2, t] - 1)
   # Rows of the slot, grouped hour by hour in the order of `sequence`
   x <- do.call(rbind, lapply(sequence, function(h) {
     subset(dfuo, updated_at_hour == h)
   }))

   print("==========================================")
   print(sequence)
   print(nrow(x))

   # Now, we only keep the last rows of a string of locations
   # NOTE(review): the original note spoke of a 1 hour gap, but the threshold
   # is 600; if updated_at holds epoch seconds that is 10 minutes - confirm.
   n_rows <- nrow(x)
   if (n_rows > 0) {
     stamps <- as.numeric(as.character(x[["updated_at"]]))
     # Difference between each row and the row after it; empty when the slot
     # has a single row, so a one-row slot no longer aborts the script
     gaps <- stamps[-n_rows] - stamps[-1]
     keep <- which(gaps > 600)
     if (length(keep) > 0) {
       final <- rbind(final, x[keep, ])
     }

     # Nothing collected so far in any slot: fall back to the slot's last row
     if (is.null(final)) {
       final <- rbind(final, x[n_rows, ])
     }
   }
   print(final)
 }

 # Build Foursquare tree
 # Downloads the Foursquare category hierarchy and fills `treeArray` so that
 # every category shortName maps to the shortName of its top-level category.
 #
 # NOTE(review): an OAuth token is committed in this file. It should be revoked
 # and supplied through FOURSQUARE_OAUTH_TOKEN; the literal remains only as a
 # fallback so existing runs keep working.
 fs_token <- Sys.getenv("FOURSQUARE_OAUTH_TOKEN")
 if (!nzchar(fs_token)) {
   fs_token <- "ADB02WREAK4W4R5BDYBVEXHWB14VZM4TQOIWZCYAD1GY22EK"
 }
 FSCats <- getURL(paste0("https://api.foursquare.com/v2/venues/categories?oauth_token=", fs_token, "&v=20120410"))
 FSTree <- fromJSON(FSCats, method = "C")
 if (length(FSTree$response$categories) == 0) {
   # An empty list here previously surfaced as a subscript error in the loop
   stop("Foursquare categories response contained no categories", call. = FALSE)
 }
 treeArray <- NULL
 for (top_category in FSTree$response$categories) {
   crawler(top_category, top_category$shortName)
 }

 # Clustering
 # For each time-of-day slot: cluster the selected positions with dbscan, plot
 # them, query Foursquare for venues around every cluster centre, and print
 # the most frequent top-level venue categories found for the slot.
 par(mfrow=c(5,2))
 radius <- 500
 # NOTE(review): an OAuth token is committed in this file. It should be revoked
 # and supplied through FOURSQUARE_OAUTH_TOKEN; the literal remains only as a
 # fallback so existing runs keep working.
 fs_token <- Sys.getenv("FOURSQUARE_OAUTH_TOKEN")
 if (!nzchar(fs_token)) {
   fs_token <- "ADB02WREAK4W4R5BDYBVEXHWB14VZM4TQOIWZCYAD1GY22EK"
 }
 timing <- c('Week Day 8-10','Week Day 10-12','Week Day 12-14','Week Day 14-16','Week Day 16-18','Week Day 18-20','Week Day 20-22','Week Day 22-24','Week Day 0-2','Week Day 6-8')
 #timing <- c('Weekend 8-10','Weekend 10-12','Weekend 12-14','Weekend 14-16','Weekend 16-18','Weekend 18-20','Weekend 20-22','Weekend 22-24','Weekend 0-2','Weekend 6-8')
 for (t in seq_len(ncol(tod))) {
   # Create dataset for given time range
   sequence <- seq(tod[1, t], tod[2, t] - 1)
   datax <- NULL
   datay <- NULL
   for (h in sequence) {
     aux <- subset(dfuo, updated_at_hour == h)
     #aux <- subset(final, updated_at_hour == h )
     # Guard keeps datax/datay NULL for an empty slot, which selects the
     # placeholder plot below
     if (nrow(aux) > 0) {
       datax <- c(datax, as.numeric(as.character(aux[["coordinates_lon"]])))
       datay <- c(datay, as.numeric(as.character(aux[["coordinates_lat"]])))
     }
   }

   # Two-column matrix (longitude, latitude); NULL when the slot has no rows
   x <- cbind(datax, datay)

   if (is.null(x)) {
     plot(c(0,0), c(0,0), main=paste(timing[t]), xlim = c(139, 140), ylim = c(35, 36), ylab = "Latitude" , xlab = "Longitude")
   } else {
     if (nrow(x) > 0) {
       d <- dbscan(x, eps=0.025, MinPts=20, scale=1, method="raw")
       plot(d, x, main=paste(timing[t]), xlim = c(139.6, 140), ylim = c(35.5, 36), ylab = "Latitude" , xlab = "Longitude")
       #plot(d, x, main=paste(timing[t],"density",0.025), xlim = c(139.7, 139.85), ylim = c(35.6, 35.8))
       #legend("bottomright", inset=.05, title="Clusters", c("1","2","3"), fill=tail(palette(), n=-1), horiz=TRUE)

       # Per-category accumulators for this slot, keyed by top-level category.
       # The ""-named zero entry makes each a named numeric vector, so looking
       # up an unseen category yields NA.
       venue <- NULL           # number of venues
       venue_count <- NULL     # summed check-in counts
       venue_distance <- NULL  # summed distances
       venue[""] <- 0
       venue_count[""] <- 0
       venue_distance[""] <- 0

       # Context inference (cluster 0 is skipped: ids start at 1 here)
       for (cl in seq_len(max(d$cluster))) {
         # Cluster center; drop = FALSE keeps a one-row subset a matrix
         clusCenter <- colMeans(x[d$cluster == cl, , drop = FALSE])

         # Fetch context: Foursquare (ll is latitude,longitude)
         Uctx <- getURL(paste0("https://api.foursquare.com/v2/venues/search?ll=", clusCenter[2], ",", clusCenter[1], "&oauth_token=", fs_token, "&v=20120410&radius=", radius, "&intent=browse"))

         # Convert JSON to R-object
         Rctx <- fromJSON(Uctx, method = "C")

         # Parse context
         ctx <- NULL
         for (v in Rctx$response$venues) {
           if (length(v$categories) == 0) {
             next
           }
           # Top-level category of the venue's first listed category
           category <- treeArray[v$categories[[1]]$shortName]
           if (length(category) != 1 || is.na(category)) {
             # Category absent from the tree: it cannot be used as a key
             next
           }
           distance <- v$location$distance
           if (is.null(distance) || distance >= radius) {
             next
           }
           checkins <- v$stats$checkinsCount
           if (is.null(checkins)) {
             checkins <- 0
           }

           ctx <- append(ctx, paste(distance, category, v$name))
           if (is.na(venue[category])) {
             venue[category] <- 1
             venue_count[category] <- checkins
             venue_distance[category] <- distance
           } else {
             venue[category] <- venue[category] + 1
             # Accumulate onto the check-in total, not onto the venue tally
             venue_count[category] <- venue_count[category] + checkins
             venue_distance[category] <- venue_distance[category] + distance
           }
         }
       }

       if (length(venue) > 1) {
         # Drops the ""-named zero entry and keeps the real categories
         finalvenue <- venue[venue >= 1]

         # Infer context
         # NOTE(review): the accumulators span all clusters of the slot, while
         # this header reports only the last cluster - confirm intent.
         print(paste(timing[t], " density: ", 0.025, " cluster: ", cl, " of ", length(finalvenue), " with ", sum(d$cluster == cl) ," elements @ ", clusCenter[2], "," ,clusCenter[1], sep=""))
         #sortedctx <- mixedsort(ctx)
         #print(sortedctx)
         sortedvenues <- mixedsort(finalvenue)
         #sortedvenuecounts <- mixedsort(venue_count)
         #for (sv in 1:length(sortedvenues)) {
           #mongo.insert(mongo, "test.people", list(name=uid, category=names(sortedvenues[sv]), qty=matrix(sortedvenues)[sv], timeslot=timingWE[i], clusterpoints=sum(d$cluster == c), loc=c(clusCenter[2], clusCenter[1])))
         #}
         #print(tail(sortedvenues,2))
         # Up to three categories with the most venues, largest first
         for (category in rev(tail(names(sortedvenues), 3))) {
           # Average distance is the summed distance over the number of venues
           print(paste("  - ", category, ": ", sortedvenues[category], "venues, ", venue_count[category], " checkins, at an avg distance of ", venue_distance[category]/venue[category]))
         }
       }
     }
   }
 }
	// Count location updates per user and list the users with the most updates first
	var updatesPerUser = [
	  { $group: { _id: "$locatable_id", number: { $sum: 1 } } },
	  { $sort: { "number": -1 } }
	];
	db.locations.aggregate(updatesPerUser)

	// Fetch the coordinates and creation time of one user's updates, ascending by created_at
	var singleUser = { "locatable_id": ObjectId("51ac7bccee4340adef00000d") };
	var returnedFields = { '_id': 0, 'coordinates': 1, 'created_at': 1 };
	db.locations.find(singleUser, returnedFields).sort({ "created_at": 1 })
	library(rmongodb)
	mg2 <- mongo.create(host = "192.168.13.141")

	# Distinct values of locatable_id in the locations collection
	users <- mongo.distinct(mg2, "fern_osm.locations", "locatable_id")

	# Query document built from an empty buffer (the filter below is disabled)
	buf <- mongo.bson.buffer.create()
	#mongo.bson.buffer.append.object(buf, "locatable_id", "5179075dee434003f8000003")
	query <- mongo.bson.from.buffer(buf)

	# Count the matches first so the result columns can be preallocated,
	# then open a cursor over the same query
	count <- mongo.count(mg2, "fern_osm.locations", query)
	cur <- mongo.find(mg2, "fern_osm.locations", query)

	# One preallocated vector per output column
	locatable_id <- character(count)
	updated_at <- character(count)
	updated_at_hour <- numeric(count)
	updated_at_dayofweek <- numeric(count)
	coordinates_lon <- numeric(count)
	coordinates_lat <- numeric(count)
	value_list <- NULL
	i <- 1
	while (mongo.cursor.next(cur)) {
	  value <- mongo.cursor.value(cur)
	  value_list <- mongo.bson.to.list(value)

	  # Pull the three fields of interest out of the converted record
	  stamp <- value_list[["updated_at"]]
	  stamp_lt <- as.POSIXlt(stamp)
	  position <- value_list[["coordinates"]]

	  locatable_id[i] <- as.character.mongo.oid(value_list[["locatable_id"]])
	  updated_at[i] <- stamp
	  updated_at_hour[i] <- stamp_lt$hour
	  updated_at_dayofweek[i] <- stamp_lt$wday

	  # First element is stored as longitude, second as latitude
	  coordinates_lon[i] <- position[1]
	  coordinates_lat[i] <- position[2]
	  i <- i + 1
	  #print(i)
	}

	# Assemble the collected columns into one data frame
	df <- data.frame(
	  locatable_id = locatable_id,
	  updated_at = updated_at,
	  updated_at_hour = updated_at_hour,
	  updated_at_dayofweek = updated_at_dayofweek,
	  coordinates_lon = coordinates_lon,
	  coordinates_lat = coordinates_lat
	)
	#df <- as.data.frame(list(locatable_id=locatable_id, updated_at=updated_at, coordinates_lon=coordinates_lon, coordinates_lat=coordinates_lat))
	#summary(df)

	mongo.cursor.destroy(cur)
	library(plyr)
	library('fpc')
	library("RCurl")
	library("rjson")
	library("gtools")

	# Crawling function
	#
	# Walks a category tree depth-first and records `name` for the node itself
	# and for every descendant, keyed by each node's shortName.
	#
	# tree: a category node (list) with a `shortName` element and, optionally, a
	#       `categories` list holding child nodes of the same shape.
	# name: the label stored for this node and all nodes below it.
	# d:    never read; kept so the signature stays unchanged.
	#
	# Side effect: writes into the `treeArray` variable of an enclosing
	# environment through `<<-`, so `treeArray` must exist before the first call.
	# Returns NULL invisibly.
	crawler <- function(tree, name, d) {
	  treeArray[tree$shortName] <<- name
	  # A missing or empty `categories` element simply yields no iterations
	  for (child in tree$categories) {
	    crawler(child, name)
	  }
	  invisible(NULL)
	}

	# Time-of-day slots: column k holds c(first_hour, end_hour) of slot k
	tod <- array(c(8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24,0,2,6,8),c(2,10))
	#tod <- array(c(8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24),c(2,8))
	#tod <- array(c(8,10,10,12,12,14,14,16,16,18,18,20,20,22,22,24),c(2,8))

	# Select user
	#Unknown #4: 51dcf3edee43408c250001e5(24198 points) NG
	#Unknown #3: 51ba886eee4340ece1000039 (23848 points) NOK
	#Fernando: 51b57fb8ee4340bec5000016 (17685 points) OK
	#Unknown #2: 528b209d9db037c57900043d (5565 points) NG
	#Unknown #1: 51f0c181ee4340139700002f (5003 points) NG
	dfu <- subset(df, locatable_id == "51b57fb8ee4340bec5000016")
	# Select weekdays: day-of-week strictly between 0 and 6
	dfuwd <- subset(dfu, updated_at_dayofweek > 0 & updated_at_dayofweek < 6)
	# Select weekends: day-of-week 0 or 6. The element-wise OR must be a plain
	# `|`; a backslash before it is not valid R.
	dfuwe <- subset(dfu, updated_at_dayofweek == 0 | updated_at_dayofweek == 6)
	# Sort results: weekday rows in descending updated_at order
	dfuo <- arrange(dfuwd, desc(updated_at))
	#dfuo <- arrange(dfuwe,desc(updated_at))
	final <- NULL

	# For every time-of-day slot, collect the selected rows of `dfuo` and append
	# to `final` each row whose updated_at exceeds the following row's by more
	# than 600. Progress and the accumulated `final` are printed per slot.
	for (t in seq_len(ncol(tod))) {
	  # Hours covered by this slot, e.g. c(8, 10) -> 8, 9
	  sequence <- seq(tod[1, t], tod[2, t] - 1)
	  # Rows of the slot, grouped hour by hour in the order of `sequence`
	  x <- do.call(rbind, lapply(sequence, function(h) {
	    subset(dfuo, updated_at_hour == h)
	  }))

	  print("==========================================")
	  print(sequence)
	  print(nrow(x))

	  # Now, we only keep the last rows of a string of locations
	  # NOTE(review): the original note spoke of a 1 hour gap, but the threshold
	  # is 600; if updated_at holds epoch seconds that is 10 minutes - confirm.
	  n_rows <- nrow(x)
	  if (n_rows > 0) {
	    stamps <- as.numeric(as.character(x[["updated_at"]]))
	    # Difference between each row and the row after it; empty when the slot
	    # has a single row, so a one-row slot no longer aborts the script
	    gaps <- stamps[-n_rows] - stamps[-1]
	    keep <- which(gaps > 600)
	    if (length(keep) > 0) {
	      final <- rbind(final, x[keep, ])
	    }

	    # Nothing collected so far in any slot: fall back to the slot's last row
	    if (is.null(final)) {
	      final <- rbind(final, x[n_rows, ])
	    }
	  }
	  print(final)
	}

	# Build Foursquare tree
	# Downloads the Foursquare category hierarchy and fills `treeArray` so that
	# every category shortName maps to the shortName of its top-level category.
	#
	# NOTE(review): an OAuth token is committed in this file. It should be revoked
	# and supplied through FOURSQUARE_OAUTH_TOKEN; the literal remains only as a
	# fallback so existing runs keep working.
	fs_token <- Sys.getenv("FOURSQUARE_OAUTH_TOKEN")
	if (!nzchar(fs_token)) {
	  fs_token <- "ADB02WREAK4W4R5BDYBVEXHWB14VZM4TQOIWZCYAD1GY22EK"
	}
	FSCats <- getURL(paste0("https://api.foursquare.com/v2/venues/categories?oauth_token=", fs_token, "&v=20120410"))
	FSTree <- fromJSON(FSCats, method = "C")
	if (length(FSTree$response$categories) == 0) {
	  # An empty list here previously surfaced as a subscript error in the loop
	  stop("Foursquare categories response contained no categories", call. = FALSE)
	}
	treeArray <- NULL
	for (top_category in FSTree$response$categories) {
	  crawler(top_category, top_category$shortName)
	}

	# Clustering
	# For each time-of-day slot: cluster the selected positions with dbscan, plot
	# them, query Foursquare for venues around every cluster centre, and print
	# the most frequent top-level venue categories found for the slot.
	par(mfrow=c(5,2))
	radius <- 500
	# NOTE(review): an OAuth token is committed in this file. It should be revoked
	# and supplied through FOURSQUARE_OAUTH_TOKEN; the literal remains only as a
	# fallback so existing runs keep working.
	fs_token <- Sys.getenv("FOURSQUARE_OAUTH_TOKEN")
	if (!nzchar(fs_token)) {
	  fs_token <- "ADB02WREAK4W4R5BDYBVEXHWB14VZM4TQOIWZCYAD1GY22EK"
	}
	timing <- c('Week Day 8-10','Week Day 10-12','Week Day 12-14','Week Day 14-16','Week Day 16-18','Week Day 18-20','Week Day 20-22','Week Day 22-24','Week Day 0-2','Week Day 6-8')
	#timing <- c('Weekend 8-10','Weekend 10-12','Weekend 12-14','Weekend 14-16','Weekend 16-18','Weekend 18-20','Weekend 20-22','Weekend 22-24','Weekend 0-2','Weekend 6-8')
	for (t in seq_len(ncol(tod))) {
	  # Create dataset for given time range
	  sequence <- seq(tod[1, t], tod[2, t] - 1)
	  datax <- NULL
	  datay <- NULL
	  for (h in sequence) {
	    aux <- subset(dfuo, updated_at_hour == h)
	    #aux <- subset(final, updated_at_hour == h )
	    # Guard keeps datax/datay NULL for an empty slot, which selects the
	    # placeholder plot below
	    if (nrow(aux) > 0) {
	      datax <- c(datax, as.numeric(as.character(aux[["coordinates_lon"]])))
	      datay <- c(datay, as.numeric(as.character(aux[["coordinates_lat"]])))
	    }
	  }

	  # Two-column matrix (longitude, latitude); NULL when the slot has no rows
	  x <- cbind(datax, datay)

	  if (is.null(x)) {
	    plot(c(0,0), c(0,0), main=paste(timing[t]), xlim = c(139, 140), ylim = c(35, 36), ylab = "Latitude" , xlab = "Longitude")
	  } else {
	    if (nrow(x) > 0) {
	      d <- dbscan(x, eps=0.025, MinPts=20, scale=1, method="raw")
	      plot(d, x, main=paste(timing[t]), xlim = c(139.6, 140), ylim = c(35.5, 36), ylab = "Latitude" , xlab = "Longitude")
	      #plot(d, x, main=paste(timing[t],"density",0.025), xlim = c(139.7, 139.85), ylim = c(35.6, 35.8))
	      #legend("bottomright", inset=.05, title="Clusters", c("1","2","3"), fill=tail(palette(), n=-1), horiz=TRUE)

	      # Per-category accumulators for this slot, keyed by top-level category.
	      # The ""-named zero entry makes each a named numeric vector, so looking
	      # up an unseen category yields NA.
	      venue <- NULL           # number of venues
	      venue_count <- NULL     # summed check-in counts
	      venue_distance <- NULL  # summed distances
	      venue[""] <- 0
	      venue_count[""] <- 0
	      venue_distance[""] <- 0

	      # Context inference (cluster 0 is skipped: ids start at 1 here)
	      for (cl in seq_len(max(d$cluster))) {
	        # Cluster center; drop = FALSE keeps a one-row subset a matrix
	        clusCenter <- colMeans(x[d$cluster == cl, , drop = FALSE])

	        # Fetch context: Foursquare (ll is latitude,longitude)
	        Uctx <- getURL(paste0("https://api.foursquare.com/v2/venues/search?ll=", clusCenter[2], ",", clusCenter[1], "&oauth_token=", fs_token, "&v=20120410&radius=", radius, "&intent=browse"))

	        # Convert JSON to R-object
	        Rctx <- fromJSON(Uctx, method = "C")

	        # Parse context
	        ctx <- NULL
	        for (v in Rctx$response$venues) {
	          if (length(v$categories) == 0) {
	            next
	          }
	          # Top-level category of the venue's first listed category
	          category <- treeArray[v$categories[[1]]$shortName]
	          if (length(category) != 1 || is.na(category)) {
	            # Category absent from the tree: it cannot be used as a key
	            next
	          }
	          distance <- v$location$distance
	          if (is.null(distance) || distance >= radius) {
	            next
	          }
	          checkins <- v$stats$checkinsCount
	          if (is.null(checkins)) {
	            checkins <- 0
	          }

	          ctx <- append(ctx, paste(distance, category, v$name))
	          if (is.na(venue[category])) {
	            venue[category] <- 1
	            venue_count[category] <- checkins
	            venue_distance[category] <- distance
	          } else {
	            venue[category] <- venue[category] + 1
	            # Accumulate onto the check-in total, not onto the venue tally
	            venue_count[category] <- venue_count[category] + checkins
	            venue_distance[category] <- venue_distance[category] + distance
	          }
	        }
	      }

	      if (length(venue) > 1) {
	        # Drops the ""-named zero entry and keeps the real categories
	        finalvenue <- venue[venue >= 1]

	        # Infer context
	        # NOTE(review): the accumulators span all clusters of the slot, while
	        # this header reports only the last cluster - confirm intent.
	        print(paste(timing[t], " density: ", 0.025, " cluster: ", cl, " of ", length(finalvenue), " with ", sum(d$cluster == cl) ," elements @ ", clusCenter[2], "," ,clusCenter[1], sep=""))
	        #sortedctx <- mixedsort(ctx)
	        #print(sortedctx)
	        sortedvenues <- mixedsort(finalvenue)
	        #sortedvenuecounts <- mixedsort(venue_count)
	        #for (sv in 1:length(sortedvenues)) {
	        #mongo.insert(mongo, "test.people", list(name=uid, category=names(sortedvenues[sv]), qty=matrix(sortedvenues)[sv], timeslot=timingWE[i], clusterpoints=sum(d$cluster == c), loc=c(clusCenter[2], clusCenter[1])))
	        #}
	        #print(tail(sortedvenues,2))
	        # Up to three categories with the most venues, largest first
	        for (category in rev(tail(names(sortedvenues), 3))) {
	          # Average distance is the summed distance over the number of venues
	          print(paste(" - ", category, ": ", sortedvenues[category], "venues, ", venue_count[category], " checkins, at an avg distance of ", venue_distance[category]/venue[category]))
	        }
	      }
	    }
	  }
	}