Last active
January 16, 2018 08:48
-
-
Save milesgrimshaw/5997603 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(stringr)
library(lubridate)
library(ggplot2)
library(scales)

# Show the starting directory, then move to the project folder so that the
# relative input/output paths used by this script resolve there.
getwd()
setwd("~/Desktop/Personal_Projects/Immersion/")

# Load the email export; as.is = TRUE keeps text columns as character.
i <- read.csv("./immersion.csv", header = TRUE, as.is = TRUE)

## Only get those emails that I sent
i <- i[grepl("From: Miles", i$Text), ]

## Eliminate those that aren't EST
# Use logical negation rather than `-which(...)`: when no Date contains a
# parenthesis, which() is empty and a negative empty index selects ZERO rows,
# silently discarding every email.
i <- i[!grepl("\\(", i$Date), ]

## Eliminate white space
i$Date <- str_trim(i$Date, side = "both")

## String split on ,
# f() returns the second comma-separated field of a string (NA if absent).
f <- function(s) strsplit(s, ",")[[1]][2]
# vapply() guarantees an unnamed character vector, even for zero rows.
i$Date <- vapply(i$Date, f, character(1), USE.NAMES = FALSE)
i$Date <- str_trim(i$Date, side = "both")

## Clean up the dates with days of month <10
# Strings of length 25 get a leading "0"; the tables before and after let the
# length distribution be checked by eye.
table(nchar(i$Date))
needs_pad <- which(nchar(i$Date) == 25)
i$Date[needs_pad] <- paste0("0", i$Date[needs_pad])
table(nchar(i$Date))

## Create time stamps
i$Time <- as.POSIXct(i$Date, format = "%d %b %Y %H:%M:%S %z",
                     tz = "America/New_York")
## Plot Distribution of Count Over Time
# Weekly histogram of sent emails, written to Emails_Count_Time.pdf.
pdf(file = "Emails_Count_Time.pdf", width = 13, height = 10)
# print() explicitly: a ggplot object is only drawn by top-level auto-printing,
# which does not happen when the script is run through source(), leaving an
# empty PDF.
print(
  ggplot(i, aes(x = Time)) +
    # binwidth is in seconds for a date-time axis: one week per bar.
    geom_histogram(binwidth = 60 * 60 * 24 * 7, aes(fill = ..count..)) +
    scale_fill_gradient("Count", low = "skyblue", high = "blue") +
    xlab("Date") + ylab("Count") +
    ggtitle("Miles' Sent Emails by Week") +
    theme_bw() +
    scale_x_datetime(breaks = "5 months", minor_breaks = "1 month",
                     labels = date_format("%m/%Y")) +
    scale_y_continuous(expand = c(0, 0))
)
dev.off()
## Get Week Day and Hour
i$week_day <- wday(i$Time, label = TRUE, abbr = FALSE)

## Calculate Precise Minute Of The Day
i$min <- minute(i$Time)
i$hour <- hour(i$Time)
# Time of day as fractional hours (minutes since midnight divided by 60).
i$t <- (i$hour * 60 + i$min) / 60
summary(i$t)

## Just get the date
# Pass tz explicitly: as.Date()'s POSIXct method defaults to UTC (and ignores
# `format`), which would push evening New York timestamps onto the next
# calendar day while i$t is still expressed in New York time.
i$date_graph <- as.Date(i$Time, tz = "America/New_York")
## Create Post Header Plot
# Bare scatter of send time (fractional hour) against date, with all axis
# titles, ticks and labels removed; written to Header.pdf.
pdf(file = "Header.pdf", width = 13, height = 10)
# print() explicitly so the plot is rendered even when the script is run via
# source(), where top-level auto-printing is disabled.
print(
  ggplot(i, aes(x = date_graph, y = t)) +
    theme_bw() +
    theme(axis.title.x = element_blank(),
          axis.title.y = element_blank(),
          axis.ticks = element_blank(),
          axis.text.x = element_blank(),
          axis.text.y = element_blank()) +
    geom_point(color = "red") +
    scale_y_continuous(breaks = seq(0, 30, by = 1), expand = c(0, 0)) +
    scale_x_date(breaks = "4 months", minor_breaks = "2 month",
                 labels = date_format("%m/%Y"))
)
dev.off()
################## TWITTER ########################
t <- read.csv("./Twitter_Data/tweets/tweets.csv", header = TRUE, as.is = TRUE)

## Create Time Variable
t$Time <- as.POSIXct(t$timestamp, format = "%Y-%m-%d %H:%M:%S %z",
                     tz = "America/New_York")

## Bind Email and Twitter Data Together
# Reduce each source to a single Time column so the two can be stacked.
t2 <- data.frame(Time = t$Time)
i2 <- data.frame(Time = i$Time)

## Set Factors For the Data Sets
# rep() sizes the label to the row count, so an empty source does not error.
t2$set <- rep("Twitter", nrow(t2))
i2$set <- rep("Gmail", nrow(i2))

## Merge Data Frames
a <- rbind(i2, t2)

## Remove NAs
# Print the positions of unparseable timestamps before dropping them.
which(is.na(a$Time))
a <- a[!is.na(a$Time), ]

## Create Time and Date Variable For Graphing
a$hour <- hour(a$Time)
a$min <- minute(a$Time)
# Time of day as fractional hours.
a$t <- (a$hour * 60 + a$min) / 60
summary(a$t)
# Pass tz explicitly: as.Date()'s POSIXct method defaults to UTC (and ignores
# `format`), which would shift evening New York timestamps to the next day
# while a$t is still expressed in New York time.
a$date <- as.Date(a$Time, tz = "America/New_York")
a$set <- as.factor(a$set)
# Rename by name rather than by position so column order cannot break it.
names(a)[names(a) == "set"] <- "Medium"
a$week_day <- wday(a$Time, label = TRUE, abbr = FALSE)
## Graph Combination of Email & Twitter By Posting Time Over Time
# Scatter of time of day against date, coloured by medium; written to
# All_Time.pdf.
pdf(file = "All_Time.pdf", width = 13, height = 10)
# print() explicitly so the plot is rendered even when the script is run via
# source(), where top-level auto-printing is disabled.
print(
  ggplot(a, aes(x = date, y = t)) +
    geom_point(aes(colour = Medium)) +
    xlab("Date") + ylab("Hour") +
    ggtitle("Miles' Sent Emails & Posted Tweets by Time of Day") +
    scale_y_continuous(breaks = seq(0, 30, by = 1), expand = c(0, 0)) +
    theme_bw() +
    scale_x_date(breaks = "4 months", minor_breaks = "2 month",
                 labels = date_format("%m/%Y"))
)
dev.off()
########### Compare Time Periods
## Work
# Parse the cutoff in the same zone as a$Time; without tz the boundary would
# depend on the time zone of the machine running the script.
start_work <- as.POSIXct("2013-06-04", format = "%Y-%m-%d",
                         tz = "America/New_York")
samp_work <- a[which(a$Time > start_work), ]

## Plot Over Time
pdf(file = "Work_Over_Time.pdf", width = 13, height = 10)
# print() explicitly so the plot is rendered even under source().
print(
  ggplot(samp_work, aes(x = date, y = t)) +
    theme_bw() +
    geom_point(aes(colour = Medium)) +
    xlab("Date") + ylab("Hour") +
    ggtitle("Miles' Sent Emails & Posted Tweets Since June 4th, 2013 by Time of Day ") +
    scale_y_continuous(breaks = seq(0, 30, by = 1), expand = c(0, 0)) +
    scale_x_date(breaks = "1 week", labels = date_format("%m/%d"))
)
dev.off()

## Plot by Day
pdf(file = "Work_By_Day.pdf", width = 13, height = 10)
print(
  ggplot(samp_work, aes(t)) +
    # geom_histogram() is the binned bar layer; geom_bar() no longer accepts
    # binwidth in current ggplot2. Bins are a quarter of an hour wide.
    geom_histogram(binwidth = 0.25, aes(fill = Medium)) +
    facet_wrap(~week_day, scales = "free_x") +
    theme_bw() +
    ggtitle("Miles' Sent Emails & Posted Tweets Since June 4th, 2013 by Hour") +
    xlab("Hour") + ylab("Count") +
    scale_x_continuous(breaks = seq(0, max(samp_work$hour), by = 4),
                       expand = c(0, 0))
)
dev.off()
## School
# Senior spring semester window; bounds are parsed in the same zone as a$Time
# so the cutoffs do not depend on the machine's local time zone.
start_semester <- as.POSIXct("2013-01-14", format = "%Y-%m-%d",
                             tz = "America/New_York")
end_semester <- as.POSIXct("2013-05-06", format = "%Y-%m-%d",
                           tz = "America/New_York")
samp_school <- a[which(a$Time > start_semester & a$Time < end_semester), ]

## Plot Over Time
pdf(file = "Senior_Spring_Over_Time.pdf", width = 13, height = 10)
# print() explicitly so the plot is rendered even under source().
print(
  ggplot(samp_school, aes(x = date, y = t)) +
    theme_bw() +
    geom_point(aes(colour = Medium)) +
    xlab("Date") + ylab("Hour") +
    ggtitle("Miles' Sent Emails & Posted Tweets During Senior Spring by Time of Day") +
    scale_y_continuous(breaks = seq(0, 30, by = 1), expand = c(0, 0)) +
    scale_x_date(breaks = "1 week", labels = date_format("%m/%d"))
)
dev.off()

## Plot By Day
pdf(file = "Senior_Spring_By_Day.pdf", width = 13, height = 10)
print(
  ggplot(samp_school, aes(t)) +
    # geom_histogram() is the binned bar layer; geom_bar() no longer accepts
    # binwidth in current ggplot2. Bins are a quarter of an hour wide.
    geom_histogram(binwidth = 0.25, aes(fill = Medium)) +
    facet_wrap(~week_day, scales = "free_x") +
    theme_bw() +
    ggtitle("Miles' Sent Emails & Posted Tweets During Senior Spring by Hour") +
    xlab("Hour") + ylab("Count") +
    # Breaks come from this sample's own hours (previously taken from
    # samp_work by copy-paste).
    scale_x_continuous(breaks = seq(0, max(samp_school$hour), by = 4),
                       expand = c(0, 0))
)
dev.off()
## School (junior spring)
# Junior spring semester window; bounds are parsed in the same zone as a$Time
# so the cutoffs do not depend on the machine's local time zone.
start_semester_j <- as.POSIXct("2012-01-14", format = "%Y-%m-%d",
                               tz = "America/New_York")
end_semester_j <- as.POSIXct("2012-05-06", format = "%Y-%m-%d",
                             tz = "America/New_York")
samp_school_j <- a[which(a$Time > start_semester_j & a$Time < end_semester_j), ]

## Plot Over Time
pdf(file = "Junior_Spring_Over_Time.pdf", width = 13, height = 10)
# print() explicitly so the plot is rendered even under source().
print(
  ggplot(samp_school_j, aes(x = date, y = t)) +
    theme_bw() +
    geom_point(aes(colour = Medium)) +
    xlab("Date") + ylab("Hour") +
    ggtitle("Miles' Sent Emails & Posted Tweets During Junior Spring by Time of Day") +
    scale_y_continuous(breaks = seq(0, 30, by = 1), expand = c(0, 0)) +
    scale_x_date(breaks = "1 week", labels = date_format("%m/%d"))
)
dev.off()

## Plot By Day
pdf(file = "Junior_Spring_By_Day.pdf", width = 13, height = 10)
print(
  ggplot(samp_school_j, aes(t)) +
    # geom_histogram() is the binned bar layer; geom_bar() no longer accepts
    # binwidth in current ggplot2. Bins are a quarter of an hour wide.
    geom_histogram(binwidth = 0.25, aes(fill = Medium)) +
    facet_wrap(~week_day, scales = "free_x") +
    theme_bw() +
    # Title corrected: this plot shows the junior spring sample (it previously
    # said "Senior Spring" by copy-paste).
    ggtitle("Miles' Sent Emails & Posted Tweets During Junior Spring by Hour") +
    xlab("Hour") + ylab("Count") +
    # Breaks come from this sample's own hours (previously taken from
    # samp_work by copy-paste).
    scale_x_continuous(breaks = seq(0, max(samp_school_j$hour), by = 4),
                       expand = c(0, 0))
)
dev.off()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment