## Source: GitHub Gist milesgrimshaw/5997603 (author: @milesgrimshaw)
## Last active: January 16, 2018 08:48
library(stringr)
library(lubridate)
library(ggplot2)
library(scales)
## ---- Load and clean the e-mail export -------------------------------------
## Reads immersion.csv, keeps only the messages sent by Miles, normalises the
## Date strings and parses them into a POSIXct column `i$Time`.

## Print the current working directory (diagnostic output only).
getwd()
## NOTE(review): hard-coded, machine-specific working directory; every relative
## path used by this script (CSV inputs, PDF outputs) is resolved against it.
setwd("~/Desktop/Personal_Projects/Immersion/")
i <- read.csv("./immersion.csv", header = TRUE, as.is = TRUE)

## Only get those emails that I sent
i <- i[grepl("From: Miles", i$Text), ]

## Eliminate those that aren't EST (rows whose Date contains a "(").
## A logical mask is used instead of `-which(...)`: when no row matches,
## `-which(...)` is `integer(0)` and would silently drop EVERY row.
i <- i[!grepl("\\(", i$Date), ]

## Eliminate white space
i$Date <- str_trim(i$Date, side = "both")

## String split on "," and keep the second field (NA when there is no comma).
## vapply() guarantees a character vector even when `i` has zero rows.
second_field <- function(s) strsplit(s, ",")[[1]][2]
i$Date <- vapply(i$Date, second_field, character(1), USE.NAMES = FALSE)
i$Date <- str_trim(i$Date, side = "both")

## Clean up the dates with days of month < 10: those strings are 25 characters
## long instead of 26, so left-pad them with a "0".
table(nchar(i$Date))
short_day <- which(nchar(i$Date) == 25)
i$Date[short_day] <- paste0("0", i$Date[short_day])
table(nchar(i$Date))

## Create time stamps
i$Time <- as.POSIXct(
  i$Date,
  format = "%d %b %Y %H:%M:%S %z",
  tz = "America/New_York"
)
## Plot Distribution of Count Over Time
## Histogram of sent e-mails with one bin per week (the bin width is 7 days
## expressed in seconds, the unit of a POSIXct axis), written to
## Emails_Count_Time.pdf.
pdf(file = "Emails_Count_Time.pdf", width = 13, height = 10)
## print() explicitly: a ggplot object is only drawn when printed, and
## top-level auto-printing does not happen when the script is run via
## source(), which would leave the PDF empty.
## NOTE(review): `..count..` is deprecated in recent ggplot2 releases in
## favour of after_stat(count); kept for compatibility with older versions.
print(
  ggplot(i, aes(x = Time)) +
    geom_histogram(binwidth = 60 * 60 * 24 * 7, aes(fill = ..count..)) +
    scale_fill_gradient("Count", low = "skyblue", high = "blue") +
    xlab("Date") +
    ylab("Count") +
    ggtitle("Miles' Sent Emails by Week") +
    theme_bw() +
    scale_x_datetime(
      breaks = "5 months",
      minor_breaks = "1 month",
      labels = date_format("%m/%Y")
    ) +
    scale_y_continuous(expand = c(0, 0))
)
dev.off()
## ---- Derive plotting columns for the e-mail data --------------------------
## Get Week Day (full day name, as an ordered factor)
i$week_day <- wday(i$Time, label = TRUE, abbr = FALSE)

## Calculate the time of day as fractional hours (e.g. 13.5 == 13:30)
i$min <- minute(i$Time)
i$hour <- hour(i$Time)
i$t <- (i$hour * 60 + i$min) / 60
summary(i$t)

## Just get the date.
## as.Date() on a POSIXct ignores `format` and converts in UTC by default,
## which moves late-evening New York e-mails onto the following day while
## `hour`/`min` above stay in local time. Pass the time zone explicitly so
## the date and the time of day describe the same local instant.
i$date_graph <- as.Date(i$Time, tz = "America/New_York")
## Create Post Header Plot
## Scatter of sent e-mails (date on x, time of day in hours on y) with all
## axis titles, ticks and labels removed, written to Header.pdf.
pdf(file = "Header.pdf", width = 13, height = 10)
## print() explicitly so the plot is also drawn when the script is run via
## source(), where top-level auto-printing is off.
print(
  ggplot(i, aes(x = date_graph, y = t)) +
    theme_bw() +
    ## theme() must come after theme_bw(), which would otherwise reset it.
    theme(
      axis.title.x = element_blank(),
      axis.title.y = element_blank(),
      axis.ticks = element_blank(),
      axis.text.x = element_blank(),
      axis.text.y = element_blank()
    ) +
    geom_point(color = "red") +
    scale_y_continuous(breaks = seq(0, 30, by = 1), expand = c(0, 0)) +
    scale_x_date(
      breaks = "4 months",
      minor_breaks = "2 month",
      labels = date_format("%m/%Y")
    )
)
dev.off()
################## TWITTER ########################
## Reads the tweet export, parses its timestamps and stacks them with the
## e-mail timestamps into one data frame `a` with columns
## Time, Medium, hour, min, t, date and week_day.
t <- read.csv("./Twitter_Data/tweets/tweets.csv", header = TRUE, as.is = TRUE)

## Create Time Variable
t$Time <- as.POSIXct(
  t$timestamp,
  format = "%Y-%m-%d %H:%M:%S %z",
  tz = "America/New_York"
)

## Bind Email and Twitter Data Together: one Time column per source plus a
## `set` label identifying where each row came from.
t2 <- data.frame(Time = t$Time, set = "Twitter", stringsAsFactors = FALSE)
i2 <- data.frame(Time = i$Time, set = "Gmail", stringsAsFactors = FALSE)

## Merge Data Frames
a <- rbind(i2, t2)

## Remove NAs (timestamps that failed to parse); the indices are printed
## first as a diagnostic.
which(is.na(a$Time))
a <- a[!is.na(a$Time), ]

## Create Time and Date Variable For Graphing
a$hour <- hour(a$Time)
a$min <- minute(a$Time)
## Time of day as fractional hours (e.g. 13.5 == 13:30)
a$t <- (a$hour * 60 + a$min) / 60
summary(a$t)

## as.Date() on a POSIXct ignores `format` and converts in UTC by default,
## which would push late-evening New York events onto the next calendar day
## while `hour`/`min` stay local. Use the same time zone as the timestamps.
a$date <- as.Date(a$Time, tz = "America/New_York")

a$set <- as.factor(a$set)
## The second column (`set`) is shown as "Medium" in the plot legends.
colnames(a)[2] <- "Medium"
a$week_day <- wday(a$Time, label = TRUE, abbr = FALSE)
## Graph Combination of Email & Twitter By Posting Time Over Time
## One point per e-mail/tweet: date on x, time of day (hours) on y, coloured
## by Medium. Written to All_Time.pdf.
pdf(file = "All_Time.pdf", width = 13, height = 10)
## print() explicitly so the plot is also drawn when the script is run via
## source(), where top-level auto-printing is off.
print(
  ggplot(a, aes(x = date, y = t)) +
    geom_point(aes(colour = Medium)) +
    xlab("Date") +
    ylab("Hour") +
    ggtitle("Miles' Sent Emails & Posted Tweets by Time of Day") +
    scale_y_continuous(breaks = seq(0, 30, by = 1), expand = c(0, 0)) +
    theme_bw() +
    scale_x_date(
      breaks = "4 months",
      minor_breaks = "2 month",
      labels = date_format("%m/%Y")
    )
)
dev.off()
########### Compare Time Periods
## Work: everything strictly after midnight on 2013-06-04.
## The cutoff is built in the same time zone as the parsed timestamps;
## without `tz` it would be interpreted in the session's local time zone and
## shift the boundary by several hours on machines outside New York.
start_work <- as.POSIXct(
  "2013-06-04",
  format = "%Y-%m-%d",
  tz = "America/New_York"
)
samp_work <- a[which(a$Time > start_work), ]
## Plot Over Time
## Work period: date on x (weekly breaks), time of day in hours on y,
## coloured by Medium. Written to Work_Over_Time.pdf.
pdf(file = "Work_Over_Time.pdf", width = 13, height = 10)
## print() explicitly so the plot is also drawn when the script is run via
## source(), where top-level auto-printing is off.
print(
  ggplot(samp_work, aes(x = date, y = t)) +
    theme_bw() +
    geom_point(aes(colour = Medium)) +
    xlab("Date") +
    ylab("Hour") +
    ggtitle("Miles' Sent Emails & Posted Tweets Since June 4th, 2013 by Time of Day ") +
    scale_y_continuous(breaks = seq(0, 30, by = 1), expand = c(0, 0)) +
    scale_x_date(breaks = "1 week", labels = date_format("%m/%d"))
)
dev.off()
## Plot by Day
## Work period: histogram of time of day (15-minute bins, since `t` is in
## hours) stacked by Medium, one facet per weekday. Written to
## Work_By_Day.pdf.
pdf(file = "Work_By_Day.pdf", width = 13, height = 10)
## geom_histogram() replaces geom_bar(binwidth = ...): current ggplot2
## releases no longer support `binwidth` on geom_bar(), and geom_histogram()
## is the binned equivalent already used elsewhere in this script.
## print() explicitly so the plot is also drawn under source().
print(
  ggplot(samp_work, aes(t)) +
    geom_histogram(binwidth = 0.25, aes(fill = Medium)) +
    facet_wrap(~week_day, scales = "free_x") +
    theme_bw() +
    ggtitle("Miles' Sent Emails & Posted Tweets Since June 4th, 2013 by Hour") +
    xlab("Hour") +
    ylab("Count") +
    scale_x_continuous(
      breaks = seq(0, max(samp_work$hour), by = 4),
      expand = c(0, 0)
    )
)
dev.off()
## School (senior spring): strictly between 2013-01-14 and 2013-05-06.
## Both bounds are built in the same time zone as the parsed timestamps;
## without `tz` they would be interpreted in the session's local time zone.
start_semester <- as.POSIXct(
  "2013-01-14",
  format = "%Y-%m-%d",
  tz = "America/New_York"
)
end_semester <- as.POSIXct(
  "2013-05-06",
  format = "%Y-%m-%d",
  tz = "America/New_York"
)
samp_school <- a[which(a$Time > start_semester & a$Time < end_semester), ]
## Plot Over Time
## Senior spring: date on x (weekly breaks), time of day in hours on y,
## coloured by Medium. Written to Senior_Spring_Over_Time.pdf.
pdf(file = "Senior_Spring_Over_Time.pdf", width = 13, height = 10)
## print() explicitly so the plot is also drawn when the script is run via
## source(), where top-level auto-printing is off.
print(
  ggplot(samp_school, aes(x = date, y = t)) +
    theme_bw() +
    geom_point(aes(colour = Medium)) +
    xlab("Date") +
    ylab("Hour") +
    ggtitle("Miles' Sent Emails & Posted Tweets During Senior Spring by Time of Day") +
    scale_y_continuous(breaks = seq(0, 30, by = 1), expand = c(0, 0)) +
    scale_x_date(breaks = "1 week", labels = date_format("%m/%d"))
)
dev.off()
## Plot By Day
## Senior spring: histogram of time of day (15-minute bins, since `t` is in
## hours) stacked by Medium, one facet per weekday. Written to
## Senior_Spring_By_Day.pdf.
pdf(file = "Senior_Spring_By_Day.pdf", width = 13, height = 10)
## geom_histogram() replaces geom_bar(binwidth = ...): current ggplot2
## releases no longer support `binwidth` on geom_bar().
## The x breaks are derived from samp_school itself; the original used
## samp_work$hour here, a copy-paste slip from the work-period plot.
## print() explicitly so the plot is also drawn under source().
print(
  ggplot(samp_school, aes(t)) +
    geom_histogram(binwidth = 0.25, aes(fill = Medium)) +
    facet_wrap(~week_day, scales = "free_x") +
    theme_bw() +
    ggtitle("Miles' Sent Emails & Posted Tweets During Senior Spring by Hour") +
    xlab("Hour") +
    ylab("Count") +
    scale_x_continuous(
      breaks = seq(0, max(samp_school$hour), by = 4),
      expand = c(0, 0)
    )
)
dev.off()
## School (junior spring): strictly between 2012-01-14 and 2012-05-06.
## Both bounds are built in the same time zone as the parsed timestamps;
## without `tz` they would be interpreted in the session's local time zone.
start_semester_j <- as.POSIXct(
  "2012-01-14",
  format = "%Y-%m-%d",
  tz = "America/New_York"
)
end_semester_j <- as.POSIXct(
  "2012-05-06",
  format = "%Y-%m-%d",
  tz = "America/New_York"
)
samp_school_j <- a[which(a$Time > start_semester_j & a$Time < end_semester_j), ]
## Plot Over Time
## Junior spring: date on x (weekly breaks), time of day in hours on y,
## coloured by Medium. Written to Junior_Spring_Over_Time.pdf.
pdf(file = "Junior_Spring_Over_Time.pdf", width = 13, height = 10)
## print() explicitly so the plot is also drawn when the script is run via
## source(), where top-level auto-printing is off.
print(
  ggplot(samp_school_j, aes(x = date, y = t)) +
    theme_bw() +
    geom_point(aes(colour = Medium)) +
    xlab("Date") +
    ylab("Hour") +
    ggtitle("Miles' Sent Emails & Posted Tweets During Junior Spring by Time of Day") +
    scale_y_continuous(breaks = seq(0, 30, by = 1), expand = c(0, 0)) +
    scale_x_date(breaks = "1 week", labels = date_format("%m/%d"))
)
dev.off()
## Plot By Day
## Junior spring: histogram of time of day (15-minute bins, since `t` is in
## hours) stacked by Medium, one facet per weekday. Written to
## Junior_Spring_By_Day.pdf.
pdf(file = "Junior_Spring_By_Day.pdf", width = 13, height = 10)
## Fixes relative to the original, which was copied from the other periods:
##   * the title said "Senior Spring" although this is the junior-spring sample;
##   * the x breaks were computed from samp_work$hour instead of this sample;
##   * geom_bar(binwidth = ...) is no longer supported by current ggplot2
##     releases, so geom_histogram() is used instead.
## print() explicitly so the plot is also drawn under source().
print(
  ggplot(samp_school_j, aes(t)) +
    geom_histogram(binwidth = 0.25, aes(fill = Medium)) +
    facet_wrap(~week_day, scales = "free_x") +
    theme_bw() +
    ggtitle("Miles' Sent Emails & Posted Tweets During Junior Spring by Hour") +
    xlab("Hour") +
    ylab("Count") +
    scale_x_continuous(
      breaks = seq(0, max(samp_school_j$hour), by = 4),
      expand = c(0, 0)
    )
)
dev.off()
## Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment