### Miles Grimshaw Code For Blog Post on Personal Data
### May 5th 2013
getwd()
setwd("~/Desktop/x_Final_Project")
library(stringr)
library(lubridate)
library(ggplot2)
############## TWITTER ##################
t <- read.csv("./Data/tweets.csv", header=TRUE, as.is=TRUE)
names(t)
t$timestamp[1:10]
### Trim the timestamp down to just the date and time
t$timestamp_clean <- str_sub(t$timestamp, 1, -7)
t$timestamp_clean[1:10]
### Make understandable as a date by R
t$tweets_datetime <- format(ymd_hms(t$timestamp_clean, quiet = TRUE), tz="America/New_York",usetz=TRUE)
t$tweets_datetime[1:10]
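## Quick sanity check on a hypothetical archive timestamp (assuming the
## "YYYY-MM-DD HH:MM:SS +0000" format of the Twitter export):
example_ts <- "2013-05-05 14:30:00 +0000"
str_sub(example_ts, 1, -7)   # "2013-05-05 14:30:00"
format(ymd_hms(str_sub(example_ts, 1, -7)), tz="America/New_York", usetz=TRUE)
## should print "2013-05-05 10:30:00 EDT" (UTC minus four hours in May)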
t$date <- substring(t$timestamp, 1, 10)
t$date[1:10]
t$date <- as.POSIXct(t$date,format='%Y-%m-%d', tz="America/New_York")
### Plot of tweets over time
pdf(file="Tweets_Over_Time.pdf",width=11,height=8.5)
ggplot(t, aes(x=date)) + geom_histogram(binwidth = 60*60*24*7, aes(fill = ..count..)) +
  scale_fill_gradient("Count", low = "skyblue", high = "blue") +
  xlab("Date") + ggtitle("Miles' Twitter output by week")
dev.off()
### Create day of the week and hour variables
t$day_of_week <- wday(t$tweets_datetime, label = TRUE, abbr = FALSE)
t$day_of_week[1:10]
t$hour <- hour(t$tweets_datetime)
t$hour[1:10]
t$am_pm <- ifelse(am(t$tweets_datetime), "AM", "PM")
t$am_pm[1:10]
### AM VS PM BY DAY OF WEEK
pdf(file="Tweets_AMPM_Day.pdf",width=11,height=8.5)
counts <- table(t$am_pm, t$day_of_week)
barplot(counts, beside=TRUE, xlab="Day of the Week", ylab="Count",
main="Tweets by Weekday Seperated by AM vs PM", cex.names=0.8, col=c('lightblue', 'navy'))
legend(x=0.5,y=300, rownames(counts), fill=c('lightblue','navy'), bty='n',
adj=0.4, cex=1.2)
dev.off()
### Distribution by Day
which(is.na(t$day_of_week))
t <- t[which(!is.na(t$day_of_week)),]
pdf(file="Tweets_Hours_Day.pdf",width=11,height=8.5)
ggplot(t, aes(hour)) + geom_bar() + facet_wrap(~ day_of_week) +
ggtitle("Miles' Tweets by Hour and Day of The Week") + xlab("Hour") + ylab("Count")
dev.off()
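## table() cross-tabulates tweets by hour and weekday; as.data.frame() melts
## that into long format (Var1 = hour, Var2 = weekday, Freq = count) for geom_tile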
daytime <- table(t$hour, t$day_of_week)
dfdaytime <- as.data.frame(daytime)
pdf(file="Tweets_Heatmap.pdf",width=11,height=8.5)
ggplot(dfdaytime, aes(x=Var2, y=Var1, fill=Freq)) + geom_tile() +
scale_fill_gradient(low="white", high="red") +
ggtitle("Heatmap of Miles' Tweets") + xlab("Day of week") + ylab ("Hour ofthe Day")
dev.off()
### WORD CLOUDS
library(tm)
library(wordcloud)
# Clean up tweets
text <- tolower(t$text)
text <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", text)
## Might not want to get rid of people for wordcloud
text <- gsub("@\\w+", "", text)
# Remove URLs
text <- gsub("http\\w+", "", text)
text <- removeWords(text, stopwords(kind="en"))
text <- removeWords(text, c("rt", "milesgrimshaw", "http", "twitter"))
text <- removePunctuation(text)
text <- removeNumbers(text)
## OR
# text <- gsub("[[:punct:]]", "", text)
# text <- gsub("[[:digit:]]", "", text)
text <- stripWhitespace(text)
# text <- gsub("[ \t]{2,}", "", text)
# text <- gsub("^\\s+|\\s+$", "", text)
# To use w/ TM package have to create a corpus
# and from that document term matrix (DTM)
text_corp <- Corpus(VectorSource(text))
text_dtm <- DocumentTermMatrix(text_corp)
inspect(text_dtm[1:2,50:60])
findFreqTerms(text_dtm, 20)
word_freq <- colSums(as.array(text_dtm))
word_freq[1:20]
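## Optional: sort the term frequencies to eyeball the top words
## (a minimal sketch; the head() count of 20 is arbitrary)
head(sort(word_freq, decreasing=TRUE), 20)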
# Create wordcloud of top 100 most frequent words
pdf(file="Tweets_Wordcloud.pdf",width=11,height=8.5)
wordcloud(names(word_freq), word_freq, scale=c(3.2,.2),
max.words = 100, random.order=FALSE,
colors=c("gray", "darkgreen", "darkblue"), rot.per=.3)
dev.off()
############## FACEBOOK ##################
### FUNCTIONS
get_day <- function (x) {
if (grepl('Monday', x)) return (1)
else if (grepl('Tuesday', x)) return (2)
else if (grepl('Wednesday', x)) return (3)
else if (grepl('Thursday', x)) return (4)
else if (grepl('Friday', x)) return (5)
else if (grepl('Saturday', x)) return (6)
else if (grepl('Sunday', x)) return (7)
else return (NA)
}
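## Quick check with hypothetical strings shaped like the Facebook activity rows:
get_day("Monday, April 29, 2013 at 10:15pm")   # 1
get_day("Today at 9:00am")                     # NA -- no weekday name to match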
get_time <- function (x) {
n <- nchar(x)
time <- substring(x,(n-11),n)
time <- gsub("</td>", "", time)
return (time)
}
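## get_time() keeps the last 12 characters and strips a trailing "</td>", so it
## assumes rows ending like "...10:15pm</td>" (hypothetical sample):
get_time("Monday, April 29, 2013 at 10:15pm</td>")   # "10:15pm"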
## Read in the html
f <- scan("./Data/Facebook/html/account_activity.html", what="", sep="\n")
f2 <- unlist(strsplit(f, '<td>', fixed = TRUE))
f2 <- f2[-1]
head(f2)
# After splitting on '<td>', the date string is every 5th element (the first cell of each 5-column row)
f2[c(1,6,11,16,21)]
l <- length(f2)
data <- f2[seq(1, length(f2), 5)]
which(is.na(data))
l <- length(data)
data[1:30]
## Manually code the 'Today' and 'Yesterday' rows, which carry no weekday name
day1 <- rep(0,13)
## 1 = Monday
## 7 = Sunday
## The first 9 entries say 'Today' (a Thursday); the next 4 say 'Yesterday' (a Wednesday)
day1[1:9] <- 4
day1[10:13] <- 3
data_subset <- data[14:l]
data_subset[1:10]
l_sub <- length(data_subset)
day2 <- rep(NA,l_sub)
which(is.na(data_subset))
## Get the dates
day2 <- lapply(data_subset, get_day)
day2 <- unlist(day2)
## Bind the manually created dates and stripped dates together
day <- c(day1,day2)
count <- table(day)
barplot(count)
## Get the time
# l is the length of the extracted date vector
time <- rep(NA,l)
time <- lapply(data, get_time)
time <- unlist(time)
## Convert the 12-hour AM/PM strings into date-times R understands
time <- strptime(time, "%I:%M%p", tz="America/New_York")
## Create a data frame with the days and times aligned
ff <- data.frame(day, time)
head(ff)
## Create factors and rename the factor levels for the day of the week
ff$day2 <- factor(ff$day)
levels(ff$day2) <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
## Strip the time down to the hour
ff$hour <- hour(ff$time)
ff$hour[1:10]
### Distribution by Day
pdf(file="Facebook_Day_Time.pdf",width=11,height=8.5)
ggplot(ff, aes(hour)) + geom_bar() + facet_wrap(~ day2) +
ggtitle("Miles' Facebook Usage by Hour and Day of The Week") +
xlab("Hour of the Day") + ylab("Count")
dev.off()
## Create a new data frame so we can make the heatmap
daytime <- table(ff$hour, ff$day2)
dfdaytime <- as.data.frame(daytime)
pdf(file="Facebook_Heatmap.pdf",width=11,height=8.5)
ggplot(dfdaytime, aes(x=Var2, y=Var1, fill=Freq)) + geom_tile() +
scale_fill_gradient(low="white", high="red") +
ggtitle("Heatmap of Miles' Facebook Usage") + xlab("Day of week") + ylab ("Hour of day")
dev.off()
############## OPEN PATHS ##################
library(maps)
library(geosphere)
## Read in the data
l <- read.csv("./Data/openpaths.csv", header=TRUE, as.is=TRUE)
head(l)
## Only select cases where I actually moved
l2 <- l[which(l$alt!=0),]
head(l2)
which(l2$alt>100)
test <- l2[which(l2$alt>100),]
names(test)
which(is.na(test$lon))
## Get the states I want to map
states <- c('new york', 'connecticut', 'massachusetts', 'new hampshire', 'vermont')
## Map just the specific states
pdf(file="Paths_States.pdf",width=11,height=8.5)
map('county',states, col="#f2f2f2", fill=TRUE, bg="white", lwd=0.01)
## Map the country
#pdf(file="Paths_USA.pdf",width=11,height=8.5)
#map('state',col="#f2f2f2", fill=TRUE, bg="white", lwd=0.01)
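## Draw a great-circle arc between each pair of consecutive points:
## gcIntermediate() returns 10 intermediate points (plus the endpoints) along
## the arc, which lines() then connects on the map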
for (i in 1:(nrow(test)-1)) {
lat_1 <- test$lat[i]
lon_1 <- test$lon[i]
lat_2 <- test$lat[i+1]
lon_2 <- test$lon[i+1]
if (lat_1 != lat_2 & lon_1 != lon_2) {
inter <- gcIntermediate(c(lon_1, lat_1), c(lon_2, lat_2), n=10, addStartEnd=TRUE)
lines(inter, col="red", lwd=0.6)
}
}
#title("Miles' Movements March 2012 - May 2013")
title("Miles' North East Movements March 2012 - May 2013")
dev.off()
#### SOURCES & HAT TIPS ###
# http://www.scienceisdelicious.net/?p=407