geoffjentry · January 25, 2014 17:33
diff --git a/gistfile1.txt b/gistfile1.txt
 # Load the saved tweets; the .rda is expected to provide `code2013`
 # (6028 tweets).
 load("code2013.rda")

 # Drop retweets (5006 tweets remain) and extract the text of each tweet.
 filtered_tweets <- strip_retweets(code2013)
 statuses <- sapply(filtered_tweets, function(tweet) tweet$getText())

 # Read in the TIOBE data, keeping names as plain strings, and lower-case
 # the language names so they can be compared with lower-cased tweet tokens.
 tiobe <- read.csv("tiobe.csv", stringsAsFactors = FALSE)
 tiobe_langs <- tolower(tiobe[["lang"]])

 # Some language names are spelled differently in the TIOBE listing and in
 # the tweets; the helper below is used to massage the tweet text so the two
 # line up. This is approximate, not exhaustive.
 #
 # replace_statuses: substitute every case-insensitive match of the regular
 # expression `was` with `is` in each element of `statuses`, and return the
 # modified vector.
 replace_statuses <- function(statuses, was, is) {
     gsub(pattern = was, replacement = is, x = statuses, ignore.case = TRUE)
 }

 # Each entry is c(pattern, replacement). Patterns are regular expressions
 # and are matched case-insensitively by replace_statuses().
 #
 # Delphi and Object Pascal map to one name and are handled by a single
 # pattern. As two separate pairs they feed on each other's output:
 # "object pascal" -> "delphi/object-pascal", and a following "delphi" rule
 # then turns that into "delphi/object-pascal/object-pascal", which no longer
 # matches the TIOBE name. The optional suffix in the pattern also means text
 # that is already normalised is left unchanged.
 replacements <- list(
     c("objective c", "objective-c"),
     c("visual basic", "visual-basic"),
     c("emacs lisp", "emacs-lisp"),
     c("common lisp", "common-lisp"),
     c("elisp", "emacs-lisp"),
     c("delphi(/object[- ]pascal)?|object pascal", "delphi/object-pascal")
 )

 # Apply every normalisation in turn to the tweet text
 for (pair in replacements) {
     statuses <- replace_statuses(statuses, pair[1], pair[2])
 }

 # Rewrite selected TIOBE names (addressed by row position in tiobe.csv) to
 # the single-token spellings used for the tweet text.
 tiobe_langs[c(7, 11)] <- "visual-basic"
 tiobe_langs[20] <- "delphi/object-pascal"
 tiobe_langs[41] <- "common-lisp"
 tiobe_langs[46] <- "emacs-lisp"

 tiobe$lang <- tiobe_langs
 # Rows 7 and 11 now both read visual-basic: add row 11's rating to row 7,
 # then remove the duplicate row.
 tiobe[7, "rating"] <- tiobe[7, "rating"] + tiobe[11, "rating"]
 tiobe <- tiobe[-11, ]

 # Lower-case every status. tolower() fails on some statuses with odd
 # encodings (67 in this data set), so each status is converted on its own
 # and the failures are dropped. The results are collected with lapply()
 # instead of growing vectors with c() inside a loop, which re-copies the
 # accumulated vector on every iteration.
 lowered <- lapply(seq_along(statuses), function(i) {
     # NULL marks a status that could not be converted
     tryCatch(tolower(statuses[[i]]), error = function(e) NULL)
 })
 is_bad <- vapply(lowered, is.null, logical(1))

 # Positions of the statuses that could not be converted
 bad_statuses <- which(is_bad)
 lowercase_statuses <- as.character(unlist(lowered[!is_bad], use.names = FALSE))

 # Keep filtered_tweets aligned with the surviving statuses. The guard is
 # required: indexing with an empty negative vector would select nothing.
 if (length(bad_statuses) > 0) {
     filtered_tweets <- filtered_tweets[-bad_statuses]
 }

 statuses <- lowercase_statuses

 # Break every status into tokens; separators are a comma, a period, or a
 # run of whitespace.
 status_tokens <- strsplit(x = statuses, split = ",|\\.|\\s+")

 # Per status, keep only the tokens that are TIOBE language names
 matching_tokens <- sapply(status_tokens, function(tokens) {
     tokens[tokens %in% tiobe_langs]
 })

 # Now have the languages mentioned in #code2013 which are in TIOBE
 code2013_langs <- unlist(matching_tokens)
 lang_counts <- sort(table(code2013_langs), decreasing = TRUE)
 # Build the frame explicitly: language names as row names, one Count column.
 # as.data.frame() on a sorted table returns a two-column (name, Freq) frame
 # on R versions where sort() keeps the "table" class, which would break the
 # rownames() lookups below.
 new_code2013_lang_table <- data.frame(Count = as.vector(lang_counts),
     row.names = names(lang_counts))
 # Rough tier of each language, derived from its position (rows are ordered
 # by descending count). Using cut() means the table does not need to have
 # exactly 40 rows; positions beyond 40 get NA.
 new_code2013_lang_table$code2013_tier <- cut(
     seq_len(nrow(new_code2013_lang_table)),
     breaks = c(0, 5, 10, 15, 25, 40),
     labels = c("1-5", "6-10", "11-15", "16-25", "26-40"),
     ordered_result = TRUE)
 # Language as an ordered factor whose levels follow the reversed TIOBE order
 new_code2013_lang_table$code2013_langs <- ordered(rownames(new_code2013_lang_table),
     levels = rev(tiobe[, "lang"]))
 new_code2013_lang_table$code2013_rank <- seq_len(nrow(new_code2013_lang_table))
 # Row position of the language in the (de-duplicated) TIOBE table
 new_code2013_lang_table$tiobe_rank <- match(new_code2013_lang_table$code2013_langs,
     tiobe[, "lang"])
 # orig_rank is looked up in `code2013_lang_table`. This script only builds
 # that table further down, so it has to be present in the workspace already;
 # fail with a clear message if it is not.
 if (!exists("code2013_lang_table")) {
     stop("code2013_lang_table must exist before the new-vs-original comparison is built",
         call. = FALSE)
 }
 new_code2013_lang_table$orig_rank <- code2013_lang_table[
     match(rownames(new_code2013_lang_table), rownames(code2013_lang_table)),
     "code2013_rank"]


 library(ggplot2)

 ## Compare new vs old
 # x carries the updated rank (code2013_rank) and y the original rank
 # (orig_rank); the axis labels and title are written to match that mapping.
 # print() is required for the plot to reach the png device when the script
 # is run through source(), where top-level values are not auto-printed.
 png(file = "update/new_vs_old.png", width = 640, height = 640)
 print(
     ggplot(new_code2013_lang_table,
            aes(x = code2013_rank, y = orig_rank, color = code2013_tier)) +
         geom_text(aes(label = code2013_langs), size = 3.5) +
         xlab("Updated #code2013 rank") + ylab("Original #code2013 Rank") +
         ggtitle("Updated vs Original #code2013 Rankings")
 )
 dev.off()

 ## Compare new to tiobe
 # print() is required for the plot to reach the png device when the script
 # is run through source(), where top-level values are not auto-printed.
 png(file = "update/update_vs_tiobe.png", width = 640, height = 640)
 print(
     ggplot(new_code2013_lang_table,
            aes(x = code2013_rank, y = tiobe_rank, color = code2013_tier)) +
         geom_text(aes(label = code2013_langs), size = 3.5) +
         ylab("TIOBE Rank") + xlab("Updated #code2013 rank") +
         ggtitle("Updated #code2013 vs TIOBE rankings")
 )
 dev.off()


 ## Compare only new to tiobe. The steps above are repeated on the first
 ## 1404 entries of code2013 (presumably the newly collected tweets), and the
 ## same variables are deliberately overwritten.
 code2013 <- code2013[seq_len(1404)]
 filtered_tweets <- strip_retweets(code2013)
 statuses <- sapply(filtered_tweets, function(tweet) tweet$getText())

 # Lower-case every status, dropping those tolower() cannot convert. Each
 # status is converted on its own so one failure does not abort the rest; the
 # results are collected with lapply() instead of growing vectors with c()
 # inside a loop, which re-copies the accumulated vector on every iteration.
 lowered <- lapply(seq_along(statuses), function(i) {
     # NULL marks a status that could not be converted
     tryCatch(tolower(statuses[[i]]), error = function(e) NULL)
 })
 is_bad <- vapply(lowered, is.null, logical(1))

 # Positions of the statuses that could not be converted
 bad_statuses <- which(is_bad)
 lowercase_statuses <- as.character(unlist(lowered[!is_bad], use.names = FALSE))

 # Keep filtered_tweets aligned with the surviving statuses. The guard is
 # required: indexing with an empty negative vector would select nothing.
 if (length(bad_statuses) > 0) {
     filtered_tweets <- filtered_tweets[-bad_statuses]
 }

 statuses <- lowercase_statuses

 # Apply every (pattern, replacement) pair in `replacements` to the statuses
 for (pair in replacements) {
     statuses <- replace_statuses(statuses, was = pair[1], is = pair[2])
 }

 # Break every status into tokens; separators are a comma, a period, or a
 # run of whitespace.
 status_tokens <- strsplit(x = statuses, split = ",|\\.|\\s+")

 # Per status, keep only the tokens that are TIOBE language names
 matching_tokens <- sapply(status_tokens, function(tokens) {
     tokens[tokens %in% tiobe_langs]
 })

 # Count how often each TIOBE language is mentioned in this subset
 code2013_langs <- unlist(matching_tokens)
 lang_counts <- sort(table(code2013_langs), decreasing = TRUE)
 # Build the frame explicitly: language names as row names, one Count column.
 # as.data.frame() on a sorted table returns a two-column (name, Freq) frame
 # on R versions where sort() keeps the "table" class, which would break the
 # rownames() lookup below.
 code2013_lang_table <- data.frame(Count = as.vector(lang_counts),
     row.names = names(lang_counts))
 # Rough tier of each language, derived from its position (rows are ordered
 # by descending count). Using cut() means the table does not need to have
 # exactly 35 rows; positions beyond 35 get NA.
 code2013_lang_table$code2013_tier <- cut(
     seq_len(nrow(code2013_lang_table)),
     breaks = c(0, 5, 10, 15, 25, 35),
     labels = c("1-5", "6-10", "11-15", "16-25", "26-35"),
     ordered_result = TRUE)
 # Language as an ordered factor whose levels follow the reversed TIOBE order
 code2013_lang_table$code2013_langs <- ordered(rownames(code2013_lang_table),
     levels = rev(tiobe[, "lang"]))
 code2013_lang_table$code2013_rank <- seq_len(nrow(code2013_lang_table))
 # Row position of the language in the (de-duplicated) TIOBE table
 code2013_lang_table$tiobe_rank <- match(code2013_lang_table$code2013_langs,
     tiobe[, "lang"])

 # Scatter of the subset's #code2013 rank against the TIOBE rank.
 # print() is required for the plot to reach the png device when the script
 # is run through source(), where top-level values are not auto-printed.
 png(file = "updated_code2013_tiobe_scatter.png", width = 640, height = 640)
 print(
     ggplot(code2013_lang_table,
            aes(x = code2013_rank, y = tiobe_rank, color = code2013_tier)) +
         geom_text(aes(label = code2013_langs), size = 3.5) +
         ylab("TIOBE Rank") + xlab("Updated #code2013 Rank") +
         ggtitle("Updated #code2013 vs TIOBE rankings")
 )
 dev.off()
	# Load the saved tweets; the .rda is expected to provide `code2013`
	# (6028 tweets).
	load("code2013.rda")

	# Drop retweets (5006 tweets remain) and extract the text of each tweet.
	filtered_tweets <- strip_retweets(code2013)
	statuses <- sapply(filtered_tweets, function(tweet) tweet$getText())

	# Read in the TIOBE data, keeping names as plain strings, and lower-case
	# the language names so they can be compared with lower-cased tweet tokens.
	tiobe <- read.csv("tiobe.csv", stringsAsFactors = FALSE)
	tiobe_langs <- tolower(tiobe[["lang"]])

	# Some language names are spelled differently in the TIOBE listing and in
	# the tweets; the helper below is used to massage the tweet text so the two
	# line up. This is approximate, not exhaustive.
	#
	# replace_statuses: substitute every case-insensitive match of the regular
	# expression `was` with `is` in each element of `statuses`, and return the
	# modified vector.
	replace_statuses <- function(statuses, was, is) {
	    gsub(pattern = was, replacement = is, x = statuses, ignore.case = TRUE)
	}

	# Each entry is c(pattern, replacement). Patterns are regular expressions
	# and are matched case-insensitively by replace_statuses().
	#
	# Delphi and Object Pascal map to one name and are handled by a single
	# pattern. As two separate pairs they feed on each other's output:
	# "object pascal" -> "delphi/object-pascal", and a following "delphi" rule
	# then turns that into "delphi/object-pascal/object-pascal", which no longer
	# matches the TIOBE name. The optional suffix in the pattern also means text
	# that is already normalised is left unchanged.
	replacements <- list(
	    c("objective c", "objective-c"),
	    c("visual basic", "visual-basic"),
	    c("emacs lisp", "emacs-lisp"),
	    c("common lisp", "common-lisp"),
	    c("elisp", "emacs-lisp"),
	    c("delphi(/object[- ]pascal)?|object pascal", "delphi/object-pascal")
	)

	# Apply every normalisation in turn to the tweet text
	for (pair in replacements) {
	    statuses <- replace_statuses(statuses, pair[1], pair[2])
	}

	# Rewrite selected TIOBE names (addressed by row position in tiobe.csv) to
	# the single-token spellings used for the tweet text.
	tiobe_langs[c(7, 11)] <- "visual-basic"
	tiobe_langs[20] <- "delphi/object-pascal"
	tiobe_langs[41] <- "common-lisp"
	tiobe_langs[46] <- "emacs-lisp"

	tiobe$lang <- tiobe_langs
	# Rows 7 and 11 now both read visual-basic: add row 11's rating to row 7,
	# then remove the duplicate row.
	tiobe[7, "rating"] <- tiobe[7, "rating"] + tiobe[11, "rating"]
	tiobe <- tiobe[-11, ]

	# Lower-case every status. tolower() fails on some statuses with odd
	# encodings (67 in this data set), so each status is converted on its own
	# and the failures are dropped. The results are collected with lapply()
	# instead of growing vectors with c() inside a loop, which re-copies the
	# accumulated vector on every iteration.
	lowered <- lapply(seq_along(statuses), function(i) {
	    # NULL marks a status that could not be converted
	    tryCatch(tolower(statuses[[i]]), error = function(e) NULL)
	})
	is_bad <- vapply(lowered, is.null, logical(1))

	# Positions of the statuses that could not be converted
	bad_statuses <- which(is_bad)
	lowercase_statuses <- as.character(unlist(lowered[!is_bad], use.names = FALSE))

	# Keep filtered_tweets aligned with the surviving statuses. The guard is
	# required: indexing with an empty negative vector would select nothing.
	if (length(bad_statuses) > 0) {
	    filtered_tweets <- filtered_tweets[-bad_statuses]
	}

	statuses <- lowercase_statuses

	# Tokenize each status: split on a comma, a period, or a run of whitespace.
	# Alternation is written as a plain `|`; a backslash before `|` is not a
	# valid escape in an R string literal and stops the file from parsing.
	status_tokens <- strsplit(statuses, ",|\\.|\\s+")

	# Per status, keep only the tokens that are TIOBE language names
	matching_tokens <- sapply(status_tokens, function(x) {
	    x[x %in% tiobe_langs]
	})

	# Now have the languages mentioned in #code2013 which are in TIOBE
	code2013_langs <- unlist(matching_tokens)
	lang_counts <- sort(table(code2013_langs), decreasing = TRUE)
	# Build the frame explicitly: language names as row names, one Count column.
	# as.data.frame() on a sorted table returns a two-column (name, Freq) frame
	# on R versions where sort() keeps the "table" class, which would break the
	# rownames() lookups below.
	new_code2013_lang_table <- data.frame(Count = as.vector(lang_counts),
	    row.names = names(lang_counts))
	# Rough tier of each language, derived from its position (rows are ordered
	# by descending count). Using cut() means the table does not need to have
	# exactly 40 rows; positions beyond 40 get NA.
	new_code2013_lang_table$code2013_tier <- cut(
	    seq_len(nrow(new_code2013_lang_table)),
	    breaks = c(0, 5, 10, 15, 25, 40),
	    labels = c("1-5", "6-10", "11-15", "16-25", "26-40"),
	    ordered_result = TRUE)
	# Language as an ordered factor whose levels follow the reversed TIOBE order
	new_code2013_lang_table$code2013_langs <- ordered(rownames(new_code2013_lang_table),
	    levels = rev(tiobe[, "lang"]))
	new_code2013_lang_table$code2013_rank <- seq_len(nrow(new_code2013_lang_table))
	# Row position of the language in the (de-duplicated) TIOBE table
	new_code2013_lang_table$tiobe_rank <- match(new_code2013_lang_table$code2013_langs,
	    tiobe[, "lang"])
	# orig_rank is looked up in `code2013_lang_table`. This script only builds
	# that table further down, so it has to be present in the workspace already;
	# fail with a clear message if it is not.
	if (!exists("code2013_lang_table")) {
	    stop("code2013_lang_table must exist before the new-vs-original comparison is built",
	        call. = FALSE)
	}
	new_code2013_lang_table$orig_rank <- code2013_lang_table[
	    match(rownames(new_code2013_lang_table), rownames(code2013_lang_table)),
	    "code2013_rank"]


	library(ggplot2)

	## Compare new vs old
	# x carries the updated rank (code2013_rank) and y the original rank
	# (orig_rank); the axis labels and title are written to match that mapping.
	# print() is required for the plot to reach the png device when the script
	# is run through source(), where top-level values are not auto-printed.
	png(file = "update/new_vs_old.png", width = 640, height = 640)
	print(
	    ggplot(new_code2013_lang_table,
	           aes(x = code2013_rank, y = orig_rank, color = code2013_tier)) +
	        geom_text(aes(label = code2013_langs), size = 3.5) +
	        xlab("Updated #code2013 rank") + ylab("Original #code2013 Rank") +
	        ggtitle("Updated vs Original #code2013 Rankings")
	)
	dev.off()

	## Compare new to tiobe
	# print() is required for the plot to reach the png device when the script
	# is run through source(), where top-level values are not auto-printed.
	png(file = "update/update_vs_tiobe.png", width = 640, height = 640)
	print(
	    ggplot(new_code2013_lang_table,
	           aes(x = code2013_rank, y = tiobe_rank, color = code2013_tier)) +
	        geom_text(aes(label = code2013_langs), size = 3.5) +
	        ylab("TIOBE Rank") + xlab("Updated #code2013 rank") +
	        ggtitle("Updated #code2013 vs TIOBE rankings")
	)
	dev.off()


	## Compare only new to tiobe. The steps above are repeated on the first
	## 1404 entries of code2013 (presumably the newly collected tweets), and the
	## same variables are deliberately overwritten.
	code2013 <- code2013[seq_len(1404)]
	filtered_tweets <- strip_retweets(code2013)
	statuses <- sapply(filtered_tweets, function(tweet) tweet$getText())

	# Lower-case every status, dropping those tolower() cannot convert. Each
	# status is converted on its own so one failure does not abort the rest; the
	# results are collected with lapply() instead of growing vectors with c()
	# inside a loop, which re-copies the accumulated vector on every iteration.
	lowered <- lapply(seq_along(statuses), function(i) {
	    # NULL marks a status that could not be converted
	    tryCatch(tolower(statuses[[i]]), error = function(e) NULL)
	})
	is_bad <- vapply(lowered, is.null, logical(1))

	# Positions of the statuses that could not be converted
	bad_statuses <- which(is_bad)
	lowercase_statuses <- as.character(unlist(lowered[!is_bad], use.names = FALSE))

	# Keep filtered_tweets aligned with the surviving statuses. The guard is
	# required: indexing with an empty negative vector would select nothing.
	if (length(bad_statuses) > 0) {
	    filtered_tweets <- filtered_tweets[-bad_statuses]
	}

	statuses <- lowercase_statuses

	# Apply every (pattern, replacement) pair in `replacements` to the statuses
	for (pair in replacements) {
	    statuses <- replace_statuses(statuses, pair[1], pair[2])
	}

	# Tokenize each status: split on a comma, a period, or a run of whitespace.
	# Alternation is written as a plain `|`; a backslash before `|` is not a
	# valid escape in an R string literal and stops the file from parsing.
	status_tokens <- strsplit(statuses, ",|\\.|\\s+")

	# Per status, keep only the tokens that are TIOBE language names
	matching_tokens <- sapply(status_tokens, function(x) {
	    x[x %in% tiobe_langs]
	})

	# Count how often each TIOBE language is mentioned in this subset
	code2013_langs <- unlist(matching_tokens)
	lang_counts <- sort(table(code2013_langs), decreasing = TRUE)
	# Build the frame explicitly: language names as row names, one Count column.
	# as.data.frame() on a sorted table returns a two-column (name, Freq) frame
	# on R versions where sort() keeps the "table" class, which would break the
	# rownames() lookup below.
	code2013_lang_table <- data.frame(Count = as.vector(lang_counts),
	    row.names = names(lang_counts))
	# Rough tier of each language, derived from its position (rows are ordered
	# by descending count). Using cut() means the table does not need to have
	# exactly 35 rows; positions beyond 35 get NA.
	code2013_lang_table$code2013_tier <- cut(
	    seq_len(nrow(code2013_lang_table)),
	    breaks = c(0, 5, 10, 15, 25, 35),
	    labels = c("1-5", "6-10", "11-15", "16-25", "26-35"),
	    ordered_result = TRUE)
	# Language as an ordered factor whose levels follow the reversed TIOBE order
	code2013_lang_table$code2013_langs <- ordered(rownames(code2013_lang_table),
	    levels = rev(tiobe[, "lang"]))
	code2013_lang_table$code2013_rank <- seq_len(nrow(code2013_lang_table))
	# Row position of the language in the (de-duplicated) TIOBE table
	code2013_lang_table$tiobe_rank <- match(code2013_lang_table$code2013_langs,
	    tiobe[, "lang"])

	# Scatter of the subset's #code2013 rank against the TIOBE rank.
	# print() is required for the plot to reach the png device when the script
	# is run through source(), where top-level values are not auto-printed.
	png(file = "updated_code2013_tiobe_scatter.png", width = 640, height = 640)
	print(
	    ggplot(code2013_lang_table,
	           aes(x = code2013_rank, y = tiobe_rank, color = code2013_tier)) +
	        geom_text(aes(label = code2013_langs), size = 3.5) +
	        ylab("TIOBE Rank") + xlab("Updated #code2013 Rank") +
	        ggtitle("Updated #code2013 vs TIOBE rankings")
	)
	dev.off()