Created
January 6, 2012 21:13
-
-
Save thomasjensen/1572432 to your computer and use it in GitHub Desktop.
analysing the data scraped from r-bloggers.com
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the packages used below: plyr (ddply), ggplot2 (plots), xtable (tables)
library(plyr)
library(ggplot2)
library(xtable)

# Directory holding the output.csv file saved in the previous post.
# "/.../" is a placeholder: replace it with the real path before running.
data_dir <- "/.../"

# Read the scraped data through an explicit path instead of calling setwd(),
# so running the script does not change the session's working directory.
data <- read.csv(file.path(data_dir, "output.csv"))

# Parse the date column. as.Date() returns NA for any value that does not
# match the format, and %B (full month name) is matched against the current
# locale, so report values that were lost instead of dropping them silently.
raw_dates <- as.character(data$date)
data$date <- as.Date(raw_dates, format = "%B %d %Y")
n_unparsed <- sum(is.na(data$date) & !is.na(raw_dates))
if (n_unparsed > 0) {
  warning(n_unparsed, " date value(s) did not match \"%B %d %Y\" and are NA",
          call. = FALSE)
}

# Derive the calendar year and month. POSIXlt stores years as an offset from
# 1900 and months as 0-11, hence the "+ 1900" and "+ 1".
date_parts <- as.POSIXlt(data$date)
data$year <- date_parts$year + 1900
data$month <- date_parts$mon + 1
# Count the posts in every (year, month) group: one row per group, with the
# number of rows of `data` that fall into it stored in `count`.
posts <- ddply(
  data,
  c("year", "month"),
  function(x) data.frame(count = nrow(x))
)

# For easier plotting, build a Date on the first day of each month so the
# counts can be drawn on a continuous time axis.
dates <- paste(posts$year, posts$month, "01", sep = "-")
posts$date <- as.Date(dates, format = "%Y-%m-%d")

# Plot the monthly post count as a line.
plot <- ggplot(posts, aes(x = date, y = count)) +
  geom_line() +
  theme_bw() +
  ylab("Post Count")

# Print explicitly: a bare `plot` expression is only auto-printed at the
# interactive top level, not when the script is run through source().
print(plot)
# Count the distinct authors in every (year, month) group.
contributors <- ddply(
  data,
  c("year", "month"),
  function(x) data.frame(contributors = length(unique(x$author)))
)

# For easier plotting, build a Date on the first day of each month so the
# counts can be drawn on a continuous time axis.
dates <- paste(contributors$year, contributors$month, "01", sep = "-")
contributors$date <- as.Date(dates, format = "%Y-%m-%d")

# Plot the monthly count of contributors as a line.
plot <- ggplot(contributors, aes(x = date, y = contributors)) +
  geom_line() +
  theme_bw()

# Print explicitly: a bare `plot` expression is only auto-printed at the
# interactive top level, not when the script is run through source().
print(plot)
# Count the posts written by each author (one row per author).
authors <- ddply(data, "author", function(x) data.frame(count = nrow(x)))

# Plot the density of contributions per author.
# theme() and element_blank() replace opts() and theme_blank(), which have
# been removed from ggplot2; the x-axis ticks and labels are still hidden.
plot <- ggplot(authors, aes(x = count)) +
  geom_density(fill = "red", alpha = .3) +
  theme_bw() +
  theme(axis.ticks = element_blank(), axis.text.x = element_blank())

# Print explicitly: a bare `plot` expression is only auto-printed at the
# interactive top level, not when the script is run through source().
print(plot)
# Get the ten authors with the highest post count.
# head() is used instead of indexing with [1:10] because indexing past the
# end of order() pads the result with all-NA rows when fewer than ten
# authors exist; head() simply returns the rows that are available.
topten <- head(authors[order(authors$count, decreasing = TRUE), ], 10)

# Emit the table as HTML without the row-name column.
print(xtable(topten), type = "html", include.rownames = FALSE)
# Count the posts of every author in every year.
authorsYear <- ddply(
  data,
  c("author", "year"),
  function(x) data.frame(count = nrow(x))
)

# For every year, print the year followed by an HTML table of its ten most
# prolific authors.
# sort() walks the years in ascending order and drops NA years (rows whose
# date is NA have no year).
for (year in sort(unique(authorsYear$year))) {
  print(year)
  # which() discards NA comparisons; a logical index containing NA would
  # otherwise add all-NA rows to every yearly table.
  table <- authorsYear[which(authorsYear$year == year), ]
  # head() rather than [1:10]: years with fewer than ten authors must not be
  # padded with NA rows.
  table <- head(table[order(table$count, decreasing = TRUE), ], 10)
  print(
    xtable(table[, c("author", "count")]),
    type = "html",
    include.rownames = FALSE
  )
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment