Instantly share code, notes, and snippets.
-
Star
0
(0)
You must be signed in to star a gist -
Fork
0
(0)
You must be signed in to fork a gist
-
Save meefen/5024366 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#------------------------ | |
# Load libraries | |
require(stringr) | |
require(RCurl) | |
require(ggplot2) | |
#------------------------ | |
# Read a published Google Spreadsheet sheet and return it as a data frame.
# The file should first be published on Google Drive. Google Drive serves
# the export over https, so the CSV text is downloaded with getURL() and
# then parsed from a text connection.
#
# Args:
#   key: value placed in the `key=` query parameter of the publish URL.
#   gid: value placed in the `gid=` query parameter (default 0).
#
# Returns:
#   A data frame parsed from the CSV export; strings stay character vectors.
gsqAPI <- function(key, gid = 0) {
  url <- paste0("https://docs.google.com/spreadsheet/pub?key=", key,
                "&single=true&gid=", gid, "&output=csv")
  conn <- textConnection(getURL(url))
  # Close the connection even when read.csv() fails, so it is never leaked.
  on.exit(close(conn), add = TRUE)
  read.csv(conn, stringsAsFactors = FALSE)
}
# Remove the first "@" found in each string, along with a "." directly in
# front of it. Some reply messages start with ".@user", hence the optional dot.
trim <- function(x) {
  mention_prefix <- '(\\.)?@'
  sub(mention_prefix, '', x)
}
# Tabulate how often each user appears in the to, from_user, rtof and rtby
# columns of a parsed archive and combine the tallies into one data frame.
# The rtof tally is the left-hand table of every merge, so only users found
# in df$rtof get a row; tallies missing for such a user are set to 0.
# Returns a data frame with columns Name (factor), rtofCount, toCount,
# rtbyCount and fromCount.
twCounts <- function(df) {
  # Frequency table of one column as a two-column (Name, <label>) data frame
  tally <- function(values, label) {
    freq <- data.frame(table(values))
    colnames(freq) <- c('Name', label)
    freq
  }

  print("Counting @'d users")
  addressed <- tally(df$to, 'toCount')
  print('Counting senders')
  senders <- tally(df$from_user, 'fromCount')
  print('Counting rtof users')
  retweeted <- tally(df$rtof, 'rtofCount')
  print('Counting rtby users')
  retweeters <- tally(df$rtby, 'rtbyCount')

  print('Merging datasets')
  # Left joins on Name, folded onto the rtof tally in the order to, rtby, from
  merged <- Reduce(
    function(acc, extra) merge(acc, extra, by = "Name", all.x = TRUE),
    list(addressed, retweeters, senders),
    retweeted
  )
  merged[is.na(merged)] <- 0
  merged$Name <- factor(merged$Name)
  merged
}
# Download a tweet archive sheet and derive reply / retweet columns.
# Reads the sheet with gsqAPI(key, gid) and adds three columns computed from
# df$text and df$from_user:
#   to   - the user a tweet opens with ("@user" or ".@user"), prefix removed
#   rtof - the user named right after a leading "RT " or "MT ", "@" removed
#   rtby - df$from_user for rows that have an rtof value, NA otherwise
# Rows whose text does not match a pattern get NA in the matching column.
#
# Args:
#   key: spreadsheet key, passed to gsqAPI().
#   gid: sheet id, passed to gsqAPI() (default 0, the same default as gsqAPI()).
#
# Returns:
#   The downloaded data frame with the to, rtof and rtby columns added.
twArchParse <- function(key, gid = 0) {
  print('Getting data')
  df <- gsqAPI(key, gid)
  print('Got data')

  print('Parsing @ messages')
  # str_extract() is vectorised, so the whole column is handled in one call
  df$to <- trim(str_extract(df$text, "^((\\.)?(@[[:alnum:]_]*))"))

  print('Parsing RT: messages')
  # Column 2 of the str_match() matrix holds the captured "@user" group
  df$rtof <- trim(str_match(df$text, "^[MR]T (@[[:alnum:]_]*)")[, 2])

  print('Parsing RT: senders')
  # Blank out the sender on rows that are not retweets. Testing is.na()
  # directly avoids pasting values into text and comparing against the
  # string 'NA', which misfires for a user whose name is literally "NA".
  df$rtby <- df$from_user
  df$rtby[is.na(df$rtof)] <- NA
  df
}
# Turn a vector into a factor whose levels run from the most frequent value
# to the least frequent one, so a bar plot draws its bars in that order.
barsorter <- function(dfc) {
  freq <- table(dfc)
  by_count <- order(freq, decreasing = TRUE)
  factor(dfc, levels = names(freq)[by_count])
}
#------------------------ | |
# Example usage: #ODDTO13 archive
key <- '0Aup6zwZoYbZ1dEZBeG83bTNlOXpxQVFDSklNQ2RjTEE'
gid <- 82

# Read and parse data
archive.data <- twArchParse(key, gid)

# Coerce id columns to character and user-name columns to factor
archive.data$id_str <- as.character(archive.data$id_str)
archive.data$from_user <- as.factor(archive.data$from_user)
archive.data$from_user_id_str <- as.character(archive.data$from_user_id_str)
# POSIXct rather than POSIXlt: POSIXlt is a list of date-time components and
# is a poor fit for a data frame column
archive.data$time <- as.POSIXct(archive.data$time, tz = "GMT",
                                format = "%d/%m/%Y %H:%M:%S")
archive.data$in_reply_to_user_id_str <- as.character(archive.data$in_reply_to_user_id_str)
archive.data$in_reply_to_screen_name <- as.factor(archive.data$in_reply_to_screen_name)
archive.data$in_reply_to_status_id_str <- as.character(archive.data$in_reply_to_status_id_str)

# Compute user stats
archive.counts <- twCounts(archive.data)

# Plot a bar chart of "RT of" counts
ggplot() +
  geom_bar(aes(x = na.omit(archive.data$rtof))) +
  theme(axis.text.x = element_text(angle = -90, size = 9)) +
  xlab("Users")

# Sorted plot based on computed counts - "RT of"
archive.data$hrt <- barsorter(archive.data$rtof)
ggplot() +
  geom_bar(aes(x = na.omit(archive.data$hrt))) +
  theme(axis.text.x = element_text(angle = -90, size = 9)) +
  xlab("Users")

# Plot a bar chart of 'to' computed counts
ggplot() +
  geom_bar(aes(x = na.omit(archive.data$to))) +
  theme(axis.text.x = element_text(angle = -90, size = 9)) +
  xlab("Users")

# Plot a bar chart of 'from' computed counts
ggplot() +
  geom_bar(aes(x = na.omit(archive.data$from_user))) +
  theme(axis.text.x = element_text(angle = -90, size = 6)) +
  xlab("Users")

# Plot an ordered bar chart of 'to' tabulated counts
archive.counts$Name <- reorder(archive.counts$Name, archive.counts$toCount)
ggplot(archive.counts) +
  geom_bar(stat = "identity", aes(x = Name, y = toCount)) +
  theme(axis.text.x = element_text(angle = -90, size = 9)) +
  xlab("Users")

# Plot a scatterplot with from and to counts on the x and y axes, and label
# size showing the RT count. The fixed point size and label angle are set
# outside aes(): inside aes() a constant is treated as data to be mapped,
# so it would be folded into the size scale instead of being used as given.
ggplot(na.omit(archive.counts)) +
  geom_point(aes(x = fromCount, y = toCount), size = 10) +
  geom_text(aes(x = fromCount, y = toCount, label = Name, size = rtofCount),
            angle = 45)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment