Skip to content

Instantly share code, notes, and snippets.

@thomasjensen
Created January 6, 2012 21:13
Show Gist options
  • Save thomasjensen/1572432 to your computer and use it in GitHub Desktop.
Save thomasjensen/1572432 to your computer and use it in GitHub Desktop.
analysing the data scraped from r-bloggers.com
#load the required libraries
library(plyr)
library(ggplot2)
library(xtable)
#point R at the folder holding the output.csv file saved in the previous post
setwd("/.../")
#load the scraped data
data <- read.csv("output.csv")
#parse the dates (full month name, day of month, four-digit year), then derive
#the calendar year and month number from a single POSIXlt conversion
data$date <- as.Date(data$date, format = "%B %d %Y")
dateParts <- as.POSIXlt(data$date)
#POSIXlt stores years as an offset from 1900 and months as 0-11
data$year <- dateParts$year + 1900
data$month <- dateParts$mon + 1
#count the posts published in each (year, month) pair
posts <- ddply(data, .(year, month), function(chunk) {
  data.frame(count = nrow(chunk))
})
#anchor every month on its first day so the counts can be drawn on a date axis
dates <- with(posts, paste(year, month, "01", sep = "-"))
posts$date <- as.Date(dates, format = "%Y-%m-%d")
#line chart of the number of posts per month
plot <- ggplot(posts, aes(x = date, y = count)) +
  geom_line() +
  ylab("Post Count") +
  theme_bw()
plot
#count the distinct authors who posted in each (year, month) pair
contributors <- ddply(data, .(year, month), function(chunk) {
  data.frame(contributors = length(unique(chunk$author)))
})
#anchor every month on its first day so the counts can be drawn on a date axis
dates <- with(contributors, paste(year, month, "01", sep = "-"))
contributors$date <- as.Date(dates, format = "%Y-%m-%d")
#line chart of the number of contributing authors per month
plot <- ggplot(contributors, aes(x = date, y = contributors)) +
  geom_line() +
  theme_bw()
plot
#get the number of posts per author
authors <- ddply(data, "author", function(x) data.frame(count = nrow(x)))
#plot the density of contributions per author, hiding the axis ticks and the
#x-axis tick labels; theme()/element_blank() replace opts()/theme_blank(),
#which were removed from ggplot2 and make the old call fail on current releases
plot <- ggplot(authors, aes(x = count)) +
  geom_density(fill = "red", alpha = .3) +
  theme_bw() +
  theme(axis.ticks = element_blank(), axis.text.x = element_blank())
plot
#get the (up to) ten authors with the highest post count; head() stops at the
#last available row, whereas indexing with 1:10 pads the table with NA rows
#when fewer than ten authors exist
topten <- head(authors[order(authors$count, decreasing = TRUE), ], 10)
#print the table as html without the row names
print(xtable(topten), type = "html", include.rownames = FALSE)
#get the number of posts per author for every year
authorsYear <- ddply(data, c("author", "year"),
                     function(x) data.frame(count = nrow(x)))
#for every year, in chronological order, print the year followed by an html
#table of its (up to) ten most prolific authors; sort() also drops NA years,
#for which the == comparison below would select only NA rows
for (year in sort(unique(authorsYear$year))) {
  print(year)
  table <- authorsYear[authorsYear$year == year, ]
  #head() avoids the NA rows that indexing with 1:10 adds when a year has
  #fewer than ten authors
  table <- head(table[order(table$count, decreasing = TRUE), ], 10)
  print(xtable(table[, c("author", "count")]),
        type = "html", include.rownames = FALSE)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment