Created
January 25, 2014 17:33
-
-
Save geoffjentry/8620150 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the saved tweets (the .rda file is expected to provide `code2013`),
# drop retweets, and pull out the text of each remaining tweet.
load("code2013.rda") # 6028 tweets
filtered_tweets = strip_retweets(code2013) # 5006 tweets
# vapply() instead of sapply(): the result is always a character vector (also
# for an empty tweet list) and anything that is not a single string fails here
# rather than further down the script.
statuses = vapply(filtered_tweets, function(x) x$getText(), character(1))
# Read in the TIOBE data
tiobe = read.csv("tiobe.csv", stringsAsFactors=FALSE)
# Lowercased language names, used below for matching against tweet tokens
tiobe_langs = tolower(tiobe[, "lang"])
# TIOBE and the tweets do not always spell a language the same way, so some
# spellings are rewritten before matching. This is a rough clean-up and will
# not catch everything.
#
# statuses: tweet texts to rewrite
# was:      regular expression to look for (matched case-insensitively)
# is:       replacement text
# Returns `statuses` with every match of `was` replaced by `is`.
replace_statuses = function(statuses, was, is) {
  gsub(pattern=was, replacement=is, x=statuses, ignore.case=TRUE)
}
# Each entry is c(pattern, replacement). The pairs are applied in order, so
# the order matters whenever one replacement contains another pattern.
#
# The Delphi / Object Pascal rules are deliberately last and in this order:
#   1. collapse an already combined "delphi/object pascal" (or "-pascal") to
#      plain "object pascal", so the next two rules cannot expand it twice;
#   2. expand "delphi";
#   3. expand "object pascal" (with a space, so it cannot re-match the
#      hyphenated output of rule 2).
# Running "object pascal" before "delphi" would turn "object pascal" into
# "delphi/object-pascal/object-pascal", which no longer matches the TIOBE name.
replacements = list(c("objective c", "objective-c"), c("visual basic", "visual-basic"),
                    c("emacs lisp", "emacs-lisp"), c("common lisp", "common-lisp"),
                    c("elisp", "emacs-lisp"),
                    c("delphi/object[- ]pascal", "object pascal"),
                    c("delphi", "delphi/object-pascal"),
                    c("object pascal", "delphi/object-pascal"))
for (pair in replacements) {
  statuses = replace_statuses(statuses, pair[1], pair[2])
}
# Rewrite the TIOBE names at fixed positions so they use the same hyphenated
# spellings that the status replacements produce.
tiobe_langs[c(7, 11)] = "visual-basic"
tiobe_langs[c(20, 41, 46)] = c("delphi/object-pascal", "common-lisp", "emacs-lisp")
tiobe$lang = tiobe_langs
# Rows 7 and 11 are now both visual-basic: fold row 11's rating into row 7,
# then drop row 11.
tiobe[7, "rating"] = tiobe[7, "rating"] + tiobe[11, "rating"]
tiobe = tiobe[-11, ]
# I want to convert this all to lowercase but there are 67 with weird encodings.
# Each status is lowercased on its own so that one failing tolower() call only
# marks that status as bad instead of aborting the whole conversion.
lowered = lapply(statuses, function(status) {
  tryCatch(tolower(status), error=function(e) NULL)
})
is_bad = vapply(lowered, is.null, logical(1))
# Positions (within `statuses` / `filtered_tweets`) that could not be lowercased
bad_statuses = unname(which(is_bad))
# Lowercased text of the remaining statuses; as.character() keeps this a
# character(0) rather than NULL when nothing survives.
lowercase_statuses = as.character(unlist(lowered[!is_bad], use.names=FALSE))
# Keep filtered_tweets aligned with the statuses that are kept. The guard
# matters: x[-integer(0)] would drop everything.
if (length(bad_statuses) > 0) {
  filtered_tweets = filtered_tweets[-bad_statuses]
}
statuses = lowercase_statuses
# tokenize each status. split on comma period or whitespace
status_tokens = strsplit(statuses, ",|\\.|\\s+")
# Keep only the tokens that are TIOBE language names. lapply() always returns
# a list, whereas sapply() may simplify to a vector or matrix depending on how
# many matches each status happens to have.
matching_tokens = lapply(status_tokens, function(tokens) {
  tokens[tokens %in% tiobe_langs]
})
# Now have the languages mentioned in #code2013 which are in TIOBE
code2013_langs = unlist(matching_tokens)
# Mentions per language, most mentioned first.
lang_counts = sort(table(code2013_langs), decreasing=TRUE)
# Build the frame explicitly: one "Count" column with the language names as
# row names, which is what the code below relies on via rownames(). On current
# R, as.data.frame() of a sorted table instead gives a two-column
# (name, Freq) frame with numeric row names.
new_code2013_lang_table = data.frame(Count=as.vector(lang_counts),
                                     row.names=names(lang_counts))
# Create a column describing the rough place of the code2013 langs.
# cut() derives the tier from the row's rank, so this works for any number of
# languages up to 40 (a fixed rep() vector only fits a table of exactly 40
# rows). Ranks above 40 get an NA tier.
n_new_langs = nrow(new_code2013_lang_table)
new_code2013_lang_table$code2013_tier = cut(seq_len(n_new_langs),
                                            breaks=c(0, 5, 10, 15, 25, 40),
                                            labels=c("1-5", "6-10", "11-15", "16-25", "26-40"),
                                            ordered_result=TRUE)
# Order by the TIOBE rankings (levels reversed, so the top TIOBE language is
# the highest level)
new_code2013_lang_table$code2013_langs = ordered(rownames(new_code2013_lang_table),
                                                 levels=rev(tiobe[, "lang"]))
# Rows are already sorted by mention count, so the row number is the rank
new_code2013_lang_table$code2013_rank = seq_len(n_new_langs)
new_code2013_lang_table$tiobe_rank = match(new_code2013_lang_table$code2013_langs, tiobe[, "lang"])
# NOTE(review): code2013_lang_table is not created anywhere above this point
# in the script, so it has to exist in the workspace already - presumably the
# table from the original analysis. Confirm before running in a fresh session.
new_code2013_lang_table$orig_rank = code2013_lang_table[match(rownames(new_code2013_lang_table),
                                                              rownames(code2013_lang_table)), "code2013_rank"]
library(ggplot2)
## Compare new vs old
png(file="update/new_vs_old.png", width=640, height=640)
# x is the updated rank (code2013_rank of the new table) and y is the original
# rank (orig_rank), so the axis labels and title are written to match.
# print() is explicit because a ggplot object is only drawn when printed, and
# top-level auto-printing does not happen when the script is source()d.
print(
  ggplot(new_code2013_lang_table, aes(x=code2013_rank, y=orig_rank, color=code2013_tier)) +
    geom_text(aes(label=code2013_langs), size=3.5) +
    xlab("Updated #code2013 Rank") + ylab("Original #code2013 Rank") +
    ggtitle("Updated vs Original #code2013 Rankings")
)
dev.off()
## Compare new to tiobe
png(file="update/update_vs_tiobe.png", width=640, height=640)
# print() is explicit because a ggplot object is only drawn when printed, and
# top-level auto-printing does not happen when the script is source()d.
print(
  ggplot(new_code2013_lang_table, aes(x=code2013_rank, y=tiobe_rank, color=code2013_tier)) +
    geom_text(aes(label=code2013_langs), size=3.5) +
    ylab("TIOBE Rank") + xlab("Updated #code2013 rank") +
    ggtitle("Updated #code2013 vs TIOBE rankings")
)
dev.off()
## Compare only new to tiobe - yes, there's a lot of ugly reused code and
## overwriting of variables. I'm lazy, hungry and want lunch.
# Keep the first 1404 tweets. head() returns at most that many, whereas
# code2013[1:1404] would pad a shorter list with NULL entries.
code2013 = head(code2013, 1404)
filtered_tweets = strip_retweets(code2013)
# vapply() instead of sapply(): always a character vector, even when empty
statuses = vapply(filtered_tweets, function(x) x$getText(), character(1))
# Lowercase each status on its own so that one failing tolower() call only
# marks that status as bad instead of aborting the whole conversion.
lowered = lapply(statuses, function(status) {
  tryCatch(tolower(status), error=function(e) NULL)
})
is_bad = vapply(lowered, is.null, logical(1))
# Positions (within `statuses` / `filtered_tweets`) that could not be lowercased
bad_statuses = unname(which(is_bad))
# Lowercased text of the remaining statuses; as.character() keeps this a
# character(0) rather than NULL when nothing survives.
lowercase_statuses = as.character(unlist(lowered[!is_bad], use.names=FALSE))
# Keep filtered_tweets aligned with the statuses that are kept. The guard
# matters: x[-integer(0)] would drop everything.
if (length(bad_statuses) > 0) {
  filtered_tweets = filtered_tweets[-bad_statuses]
}
statuses = lowercase_statuses
# Apply the same spelling replacements as in the first pass
for (pair in replacements) {
  statuses = replace_statuses(statuses, pair[1], pair[2])
}
# tokenize each status. split on comma period or whitespace
status_tokens = strsplit(statuses, ",|\\.|\\s+")
# Keep only the tokens that are TIOBE language names. lapply() always returns
# a list, whereas sapply() may simplify to a vector or matrix depending on how
# many matches each status happens to have.
matching_tokens = lapply(status_tokens, function(tokens) {
  tokens[tokens %in% tiobe_langs]
})
code2013_langs = unlist(matching_tokens)
# Mentions per language, most mentioned first.
lang_counts = sort(table(code2013_langs), decreasing=TRUE)
# Build the frame explicitly: one "Count" column with the language names as
# row names, which is what the code below relies on via rownames(). On current
# R, as.data.frame() of a sorted table instead gives a two-column
# (name, Freq) frame with numeric row names.
code2013_lang_table = data.frame(Count=as.vector(lang_counts),
                                 row.names=names(lang_counts))
# Create a column describing the rough place of the code2013 langs.
# cut() derives the tier from the row's rank, so this works for any number of
# languages up to 35 (a fixed rep() vector only fits a table of exactly 35
# rows). Ranks above 35 get an NA tier.
n_langs = nrow(code2013_lang_table)
code2013_lang_table$code2013_tier = cut(seq_len(n_langs),
                                        breaks=c(0, 5, 10, 15, 25, 35),
                                        labels=c("1-5", "6-10", "11-15", "16-25", "26-35"),
                                        ordered_result=TRUE)
# Order by the TIOBE rankings (levels reversed, so the top TIOBE language is
# the highest level)
code2013_lang_table$code2013_langs = ordered(rownames(code2013_lang_table),
                                             levels=rev(tiobe[, "lang"]))
# Rows are already sorted by mention count, so the row number is the rank
code2013_lang_table$code2013_rank = seq_len(n_langs)
code2013_lang_table$tiobe_rank = match(code2013_lang_table$code2013_langs, tiobe[, "lang"])
png(file="updated_code2013_tiobe_scatter.png", width=640, height=640)
# print() is explicit because a ggplot object is only drawn when printed, and
# top-level auto-printing does not happen when the script is source()d.
print(
  ggplot(code2013_lang_table, aes(x=code2013_rank, y=tiobe_rank, color=code2013_tier)) +
    geom_text(aes(label=code2013_langs), size=3.5) +
    ylab("TIOBE Rank") + xlab("Updated #code2013 Rank") +
    ggtitle("Updated #code2013 vs TIOBE rankings")
)
dev.off()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment