jm3 · May 12, 2012 22:11
diff --git a/jm3-redis-stats.R b/jm3-redis-stats.R
 # aggregate stats from redis from our tweet research

 # util: pretty-print bignums w/commas for readability
 pp <- function(x) {
   # Render x as text in fixed (never scientific) notation, inserting ","
   # between groups of thousands so large counts are easy to read.
   format(x, scientific = FALSE, big.mark = ",")
 }

 # open up access to redis data 
 library(rredis)
 redisConnect()

 # core sets + zsets
 # keys of plain Redis sets (sized with redisSCard further down)
 sets <- c(
   "tweets:hashtags",
   "tweets:links",
   "tweets:mentions",
   "user:is_public"
 )
 # keys of Redis sorted sets (sized with redisZCard further down)
 zsets <- c(
   "words",
   "user:followers",
   "user:num_tweets"
 )

 # 8 language sets + 121 country sets
 # language codes; each becomes the suffix of a "user:lang:<code>" set key
 langs <- c("DE", "EN", "ES", "FA", "FR", "NL", "PT", "RU")

 # country codes in alphabetical order, laid out by leading letter;
 # each becomes the suffix of a "user:country:<code>" set key
 countries <- c(
   "AE", "AF", "AG", "AM", "AO", "AQ", "AR", "AT", "AU", "AZ",
   "BA", "BB", "BD", "BE", "BH", "BN", "BR", "BS", "BW", "BY",
   "CA", "CH", "CL", "CN", "CO", "CR", "CU", "CY",
   "DE", "DK", "DO", "DZ", "EC", "EE", "EG", "ES", "ET",
   "FI", "FJ", "FK", "FR", "GB", "GE", "GH", "GI", "GL", "GR", "GT", "GU",
   "HK", "HN", "HR", "HU", "ID", "IE", "IL", "IN", "IR", "IT",
   "JM", "JO", "JP", "KE", "KH", "KP", "KR", "KW",
   "LB", "LK", "LT", "LU", "LV",
   "MA", "MC", "MK", "MT", "MU", "MW", "MX", "MY",
   "NG", "NI", "NL", "NO", "NP", "NZ", "OM",
   "PA", "PE", "PH", "PK", "PL", "PT", "PY", "QA",
   "RO", "RS", "RU", "RW", "SA", "SE", "SG", "SI", "SN", "SV",
   "TH", "TR", "TT", "TW", "TZ", "UA", "UG", "US", "UY",
   "VA", "VE", "VI", "VN", "XK", "ZA", "ZW"
 )

 # walk the list of key names and pretty-print stats for each set
 for (i in seq_along(sets)) {
   # seq_along() instead of 1:length(): an empty key vector then means zero
   # iterations rather than a walk over c(1, 0) that queries an NA key.
   # Prints "<key> : <member count>" using the set-cardinality call.
   print(paste(sets[i], ":", pp(redisSCard(sets[i]))))
 }

 # ...and zset
 for (i in seq_along(zsets)) {
   # same output shape, but sorted sets are sized with the zset call
   print(paste(zsets[i], ":", pp(redisZCard(zsets[i]))))
 }

 # emit basic cardinality for all languages...
 # One slot per language. integer(length(...)) is also correct for an empty
 # vector, where c(1:length(langs)) would have produced c(1, 0).
 lang_stats <- integer(length(langs))
 for (i in seq_along(langs)) {
   # the per-language set lives at "user:lang:<code>"
   key <- paste0("user:lang:", langs[i])
   card <- redisSCard(key)
   lang_stats[i] <- card
   print(paste(key, pp(card)))
 }
 # pair each language code with its count and give the columns readable names
 lang_stats <- data.frame(langs, lang_stats)
 names(lang_stats) <- c("tweet language", "occurrences")

 # ...and countries
 # One slot per country. integer(length(...)) is also correct for an empty
 # vector, where c(1:length(countries)) would have produced c(1, 0).
 country_stats <- integer(length(countries))
 for (i in seq_along(countries)) {
   # the per-country set lives at "user:country:<code>"
   key <- paste0("user:country:", countries[i])
   card <- redisSCard(key)
   country_stats[i] <- card
   print(paste(key, pp(card)))
 }
 # pair each country code with its count and give the columns readable names
 country_stats <- data.frame(countries, country_stats)
 names(country_stats) <- c("tweet country", "occurrences")

 # clean up the workspace: drop the loop temporaries, keep the data.frames
 rm(i, card, key)

 # after the run, stats accrue in 2 data.frames: lang_stats + country_stats

 # "tweets:hashtags : 458,640
 # "tweets:links : 270,319
 # "tweets:mentions : 1,086,466
 # "user:is_public : 1,812,923
 #
 # "words : 503,999
 # "user:followers : 1,711,305
 # "user:num_tweets : 1,207,538
 #
 # "user:lang:DE 9,369
 # "user:lang:EN 1,622,940
 # "user:lang:ES 62,800
 # "user:lang:FA 932
 # "user:lang:FR 166,233
 # "user:lang:NL 3,361
 # "user:lang:PT 5,109
 # "user:lang:RU 124,741
	# aggregate stats from redis from our tweet research

	# util: pretty-print bignums w/commas for readability
	pp <- function(x) {
	  # Render x as text in fixed (never scientific) notation, inserting ","
	  # between groups of thousands so large counts are easy to read.
	  format(x, scientific = FALSE, big.mark = ",")
	}

	# open up access to redis data
	library(rredis)
	redisConnect()

	# core sets + zsets
	# keys of plain Redis sets (sized with redisSCard further down)
	sets <- c(
	  "tweets:hashtags",
	  "tweets:links",
	  "tweets:mentions",
	  "user:is_public"
	)
	# keys of Redis sorted sets (sized with redisZCard further down)
	zsets <- c(
	  "words",
	  "user:followers",
	  "user:num_tweets"
	)

	# 8 language sets + 121 country sets
	# language codes; each becomes the suffix of a "user:lang:<code>" set key
	langs <- c("DE", "EN", "ES", "FA", "FR", "NL", "PT", "RU")

	# country codes in alphabetical order, laid out by leading letter;
	# each becomes the suffix of a "user:country:<code>" set key
	countries <- c(
	  "AE", "AF", "AG", "AM", "AO", "AQ", "AR", "AT", "AU", "AZ",
	  "BA", "BB", "BD", "BE", "BH", "BN", "BR", "BS", "BW", "BY",
	  "CA", "CH", "CL", "CN", "CO", "CR", "CU", "CY",
	  "DE", "DK", "DO", "DZ", "EC", "EE", "EG", "ES", "ET",
	  "FI", "FJ", "FK", "FR", "GB", "GE", "GH", "GI", "GL", "GR", "GT", "GU",
	  "HK", "HN", "HR", "HU", "ID", "IE", "IL", "IN", "IR", "IT",
	  "JM", "JO", "JP", "KE", "KH", "KP", "KR", "KW",
	  "LB", "LK", "LT", "LU", "LV",
	  "MA", "MC", "MK", "MT", "MU", "MW", "MX", "MY",
	  "NG", "NI", "NL", "NO", "NP", "NZ", "OM",
	  "PA", "PE", "PH", "PK", "PL", "PT", "PY", "QA",
	  "RO", "RS", "RU", "RW", "SA", "SE", "SG", "SI", "SN", "SV",
	  "TH", "TR", "TT", "TW", "TZ", "UA", "UG", "US", "UY",
	  "VA", "VE", "VI", "VN", "XK", "ZA", "ZW"
	)

	# walk the list of key names and pretty-print stats for each set
	for (i in seq_along(sets)) {
	  # seq_along() instead of 1:length(): an empty key vector then means zero
	  # iterations rather than a walk over c(1, 0) that queries an NA key.
	  # Prints "<key> : <member count>" using the set-cardinality call.
	  print(paste(sets[i], ":", pp(redisSCard(sets[i]))))
	}

	# ...and zset
	for (i in seq_along(zsets)) {
	  # same output shape, but sorted sets are sized with the zset call
	  print(paste(zsets[i], ":", pp(redisZCard(zsets[i]))))
	}

	# emit basic cardinality for all languages...
	# One slot per language. integer(length(...)) is also correct for an empty
	# vector, where c(1:length(langs)) would have produced c(1, 0).
	lang_stats <- integer(length(langs))
	for (i in seq_along(langs)) {
	  # the per-language set lives at "user:lang:<code>"
	  key <- paste0("user:lang:", langs[i])
	  card <- redisSCard(key)
	  lang_stats[i] <- card
	  print(paste(key, pp(card)))
	}
	# pair each language code with its count and give the columns readable names
	lang_stats <- data.frame(langs, lang_stats)
	names(lang_stats) <- c("tweet language", "occurrences")

	# ...and countries
	# One slot per country. integer(length(...)) is also correct for an empty
	# vector, where c(1:length(countries)) would have produced c(1, 0).
	country_stats <- integer(length(countries))
	for (i in seq_along(countries)) {
	  # the per-country set lives at "user:country:<code>"
	  key <- paste0("user:country:", countries[i])
	  card <- redisSCard(key)
	  country_stats[i] <- card
	  print(paste(key, pp(card)))
	}
	# pair each country code with its count and give the columns readable names
	country_stats <- data.frame(countries, country_stats)
	names(country_stats) <- c("tweet country", "occurrences")

	# clean up the workspace: drop the loop temporaries, keep the data.frames
	rm(i, card, key)

	# after the run, stats accrue in 2 data.frames: lang_stats + country_stats

	# "tweets:hashtags : 458,640
	# "tweets:links : 270,319
	# "tweets:mentions : 1,086,466
	# "user:is_public : 1,812,923
	#
	# "words : 503,999
	# "user:followers : 1,711,305
	# "user:num_tweets : 1,207,538
	#
	# "user:lang:DE 9,369
	# "user:lang:EN 1,622,940
	# "user:lang:ES 62,800
	# "user:lang:FA 932
	# "user:lang:FR 166,233
	# "user:lang:NL 3,361
	# "user:lang:PT 5,109
	# "user:lang:RU 124,741