Skip to content

Instantly share code, notes, and snippets.

@milesgrimshaw
Created February 11, 2014 19:13
Show Gist options
  • Save milesgrimshaw/8941929 to your computer and use it in GitHub Desktop.
Save milesgrimshaw/8941929 to your computer and use it in GitHub Desktop.
Analysis of personal Evernote notes archive
# Load packages
library(ggplot2)
library(lubridate)
library(stringr)
library(scales)
# Set the working directory
# Report the current working directory (the value is not stored anywhere)
getwd()
# Move into the folder holding the exported note files; the list.files()
# call in the script section reads whatever is in the working directory
setwd("~/Dropbox/Personal/Projects/Evernote_Analysis/files/")
######### FUNCTIONS ############
# Extracts the creation timestamp and the word count of every note found in
# the lines of one Evernote export file.
#
# Args:
#   file: character vector with the lines of the export, one element per line.
#
# Returns:
#   A data frame with one row per note and two columns:
#     times  - the raw text found between <created> and </created>
#     counts - number of whitespace-separated words in the note once markup
#              tags and embedded <data> sections have been removed
#
# Raises:
#   An error when <note>/</note> tags are unbalanced or when the number of
#   <created> lines differs from the number of notes, because rows could then
#   no longer be paired reliably.
get_dates_counts <- function(file) {
  created_pattern <- ".*<created>([0-9]+T[0-9]+Z)?</created>.*"

  # Lines carrying a creation timestamp, reduced to the timestamp itself
  created_lines <- grep(created_pattern, file, value = TRUE)
  times <- sub(created_pattern, "\\1", created_lines)

  # Notes go from <note> to </note>
  note_begins <- grep("<note>", file, fixed = TRUE)
  note_ends <- grep("</note>", file, fixed = TRUE)
  if (length(note_begins) != length(note_ends)) {
    stop(
      "Unbalanced note tags: ", length(note_begins), " <note> vs ",
      length(note_ends), " </note>",
      call. = FALSE
    )
  }

  # Counts the words of a single note given the lines from <note> to </note>
  count_note_words <- function(note_lines) {
    # Embedded attachments sit between a line containing '<data ' and a line
    # containing '</data>'; those lines (inclusive) are not text.
    data_start <- grep("<data ", note_lines, fixed = TRUE)
    data_end <- grep("</data>", note_lines, fixed = TRUE)
    keep <- rep(TRUE, length(note_lines))
    for (k in seq_along(data_start)) {
      # A section without a closing tag is taken to run to the end of the note
      last <- if (k <= length(data_end)) data_end[k] else length(note_lines)
      # Never build a descending range if the tags are out of order
      last <- max(last, data_start[k])
      keep[data_start[k]:last] <- FALSE
    }
    # Strip markup tags, then count runs of non-whitespace characters.
    # Base regex is used so the function does not depend on an attached package.
    text <- gsub("<(.*?)>", "", note_lines[keep])
    sum(lengths(regmatches(text, gregexpr("\\S+", text))))
  }

  # One word count per note. Iterating over the notes themselves matters:
  # the loop bound must be the number of notes, not an unrelated object.
  counts <- vapply(
    seq_along(note_begins),
    function(i) count_note_words(file[note_begins[i]:note_ends[i]]),
    integer(1)
  )

  if (length(times) != length(counts)) {
    stop(
      "Found ", length(times), " <created> lines but ", length(counts),
      " notes; cannot pair timestamps with word counts",
      call. = FALSE
    )
  }

  # Column names `times` and `counts` come from the argument names
  df_new <- data.frame(times, counts)
  df_new
}
######### SCRIPT ############
# Get all the files from the archive (read from the current working directory)
files <- list.files()
# Get the total number of files
num <- length(files)
# Parse every file into its own data frame of timestamps and word counts.
# lapply() copes with an empty folder, where `1:num` would iterate over c(1, 0).
per_file <- lapply(files, function(path) {
  get_dates_counts(scan(path, what = "", sep = "\n"))
})
# Stack everything in a single rbind() call instead of growing a data frame
# inside a loop. The empty seed keeps `df` a two-column data frame even when
# there are no files; rbind() ignores zero-row arguments otherwise.
seed <- data.frame(time = vector(), count = vector())
df <- do.call(rbind, c(list(seed), per_file))
# Rename the data frame columns
names(df) <- c("time", "count")
# Create a formatted time stamp. The raw values end in "Z" and are parsed as
# GMT in one vectorized call; the column keeps the GMT time zone attribute.
df$time <- as.POSIXct(
  as.character(df$time),
  format = "%Y%m%dT%H%M%SZ",
  tz = "GMT"
)
# Cumulative number of notes created up to and including each note's time
df$note_count <- vapply(
  df$time,
  function(t) sum(df$time <= t),
  integer(1)
)
# Cumulative sum of words written up to and including each note's time
df$word_count <- vapply(
  df$time,
  function(t) sum(df$count[which(df$time <= t)]),
  numeric(1)
)
# To custom-scale the x-axis labels the time stamp needs to be in Date
# rather than POSIXct format
df_2 <- df
df_2$time <- as.Date(df_2$time)
# Reset the working directory for saving images
setwd("~/Dropbox/Personal/Projects/Evernote_Analysis/")
# Create a graphic of the number of notes I have created by week.
# The bin width is one week expressed in seconds.
notes_by_week <- ggplot(df, aes(x = time)) +
  geom_histogram(binwidth = 60 * 60 * 24 * 7, fill = "blue") +
  xlab("Date") +
  ylab("Number Of New Notes") +
  ggtitle("Miles' New Evernotes By Week")
# ggsave() renders the plot explicitly and opens/closes the PDF device itself.
# A bare ggplot() call between pdf() and dev.off() is only drawn when R
# auto-prints it, which does not happen when the script is run via source().
ggsave("New_Notes_By_Week.pdf", plot = notes_by_week, width = 11, height = 8.5)
# To calculate the linear regression have to zero the x-axis time stamp
# (days since the earliest note)
df_3 <- df_2
df_3$time <- as.double(df_3$time)
df_3$time <- df_3$time - min(df_3$time)
# Create a linear model (forced through the origin) for number of notes by time
fit <- lm(note_count ~ 0 + time, data = df_3)
# print() so the summary is shown even when the script is run via source()
print(summary(fit))
z <- coef(fit)
# Graph the total number of notes created over time.
# The model was fitted on zeroed days, so the line is shifted back onto the
# Date axis with an intercept of -slope * (earliest date as a number).
total_notes_plot <- ggplot(df_2, aes(time, note_count)) +
  geom_point() +
  ylab("Total Number of Notes") +
  xlab("Date") +
  ggtitle("Total Number Of Notes Created Over Time") +
  scale_x_date(labels = date_format("%b-%Y"), breaks = date_breaks("2 months")) +
  geom_abline(
    intercept = (-z * as.double(min(df_2$time))),
    slope = z,
    colour = "red"
  ) +
  geom_vline(
    xintercept = as.numeric(as.Date("2013-10-21")),
    colour = "blue",
    linetype = "longdash"
  )
# ggsave() draws the plot explicitly and manages the PDF device itself, so the
# file is not left empty under source() nor the device left open on error.
ggsave("Total_Notes_Over_Time.pdf", plot = total_notes_plot, width = 22, height = 17)
# Create a linear regression (forced through the origin) for words vs. time
fit <- lm(word_count ~ 0 + time, data = df_3)
# print() so the summary is shown even when the script is run via source()
print(summary(fit))
z <- coef(fit)
# Visualize the total number of words vs. time.
# The intercept shifts the fitted line from zeroed days back onto the Date axis.
total_words_plot <- ggplot(df_2, aes(time, word_count)) +
  geom_point() +
  ylab("Total Number of Words") +
  xlab("Date") +
  ggtitle("Total Number Of Words Written Over Time") +
  scale_x_date(labels = date_format("%b-%Y"), breaks = date_breaks("2 months")) +
  geom_abline(
    intercept = (-z * as.double(min(df_2$time))),
    slope = z,
    colour = "red"
  ) +
  geom_vline(
    xintercept = as.numeric(as.Date("2013-10-21")),
    colour = "blue",
    linetype = "longdash"
  )
# ggsave() draws the plot explicitly and manages the PDF device itself, so the
# file is not left empty under source() nor the device left open on error.
ggsave("Total_Words_Over_Time.pdf", plot = total_words_plot, width = 22, height = 17)
# Create a linear regression (forced through the origin) for word count vs. note count
fit <- lm(word_count ~ 0 + note_count, data = df_2)
# print() so the summary is shown even when the script is run via source()
print(summary(fit))
z <- coef(fit)
# How many notes did I have when I became an enterprise client
# The 18th is the closest date I have for having created a note.
# Counting every note dated on or before the 18th always yields exactly one
# number; looking up rows dated exactly the 18th returns one value per note
# created that day (or none at all), which draws several lines or no line.
num_notes <- sum(df_2$time <= as.Date("2013-10-18"), na.rm = TRUE)
# Visualize the number of words vs. number of notes
words_vs_notes_plot <- ggplot(df_2, aes(note_count, word_count)) +
  geom_point() +
  ylab("Total Number of Words") +
  xlab("Notes") +
  ggtitle("Total Number Of Words Written vs. Number Of Notes") +
  geom_abline(slope = z, colour = "red") +
  geom_vline(xintercept = num_notes, colour = "blue", linetype = "longdash")
# ggsave() draws the plot explicitly and manages the PDF device itself, so the
# file is not left empty under source() nor the device left open on error.
ggsave("Total_Words_Over_Notebooks.pdf", plot = words_vs_notes_plot, width = 22, height = 17)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment