### Miles Grimshaw Code For Blog Post on Personal Data
### May 5th 2013
getwd()
setwd("~/Desktop/x_Final_Project")
library(stringr)
library(lubridate)
library(ggplot2)
############## TWITTER ##################
t <- read.csv("./Data/tweets.csv", header=TRUE, as.is=TRUE)
names(t)
t$timestamp[1:10]
### Trim the timestamp down to just the date and time
t$timestamp_clean <- str_sub(t$timestamp, 1, -7)
t$timestamp_clean[1:10]
### Parse into a date-time R understands, converted to Eastern time
### (with_tz keeps a POSIXct object, so wday()/hour()/am() below work directly)
t$tweets_datetime <- with_tz(ymd_hms(t$timestamp_clean, quiet = TRUE), tzone="America/New_York")
t$tweets_datetime[1:10]
t$date <- substring(t$timestamp, 1, 10)
t$date[1:10]
t$date <- as.POSIXct(t$date, format='%Y-%m-%d', tz="America/New_York")
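## Hedged aside: lubridate's ymd_hms can typically parse the raw "+0000"
## UTC offset directly, which would make the str_sub trim above unnecessary
## (a sketch, assuming the archive's "YYYY-MM-DD HH:MM:SS +0000" format;
## tweets_datetime_alt is a hypothetical name):
# t$tweets_datetime_alt <- with_tz(ymd_hms(t$timestamp), tzone="America/New_York")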
### Plot of tweets over time
pdf(file="Tweets_Over_Time.pdf",width=11,height=8.5)
ggplot(t, aes(x=date)) + geom_histogram(binwidth = 60*60*24*7, aes(fill = ..count..)) +
  scale_fill_gradient("Count", low = "skyblue", high = "blue") +
  xlab("Date") + ggtitle("Miles' Twitter output by week")
dev.off()
### Create day of the week and hour variables
t$day_of_week <- wday(t$tweets_datetime, label = TRUE, abbr = FALSE)
t$day_of_week[1:10]
t$hour <- hour(t$tweets_datetime)
t$hour[1:10]
t$am_pm <- ifelse(am(t$tweets_datetime), "AM", "PM")
t$am_pm[1:10]
### AM VS PM BY DAY OF WEEK
pdf(file="Tweets_AMPM_Day.pdf",width=11,height=8.5)
counts <- table(t$am_pm, t$day_of_week)
barplot(counts, beside=TRUE, xlab="Day of the Week", ylab="Count",
        main="Tweets by Weekday Separated by AM vs PM", cex.names=0.8, col=c('lightblue', 'navy'))
legend(x=0.5,y=300, rownames(counts), fill=c('lightblue','navy'), bty='n',
       adj=0.4, cex=1.2)
dev.off()
### Distribution by Day
which(is.na(t$day_of_week))
t <- t[which(!is.na(t$day_of_week)),]
pdf(file="Tweets_Hours_Day.pdf",width=11,height=8.5)
ggplot(t, aes(hour)) + geom_bar() + facet_wrap(~ day_of_week) +
  ggtitle("Miles' Tweets by Hour and Day of The Week") + xlab("Hour") + ylab("Count")
dev.off()
daytime <- table(t$hour, t$day_of_week)
dfdaytime <- as.data.frame(daytime)
pdf(file="Tweets_Heatmap.pdf",width=11,height=8.5)
ggplot(dfdaytime, aes(x=Var2, y=Var1, fill=Freq)) + geom_tile() +
  scale_fill_gradient(low="white", high="red") +
  ggtitle("Heatmap of Miles' Tweets") + xlab("Day of week") + ylab("Hour of the Day")
dev.off()
### WORD CLOUDS
library(tm)
library(wordcloud)
# Clean up tweets
text <- tolower(t$text)
text <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", text)
## Might not want to get rid of people for the wordcloud (see the mentions sketch below)
text <- gsub("@\\w+", "", text)
# Remove html links
text <- gsub("http\\w+", "", text)
text <- removeWords(text, stopwords(kind="en"))
text <- removeWords(text, c("rt", "milesgrimshaw", "http", "twitter"))
text <- removePunctuation(text)
text <- removeNumbers(text)
## OR
# text <- gsub("[[:punct:]]", "", text)
# text <- gsub("[[:digit:]]", "", text)
text <- stripWhitespace(text)
# text <- gsub("[ \t]{2,}", "", text)
# text <- gsub("^\\s+|\\s+$", "", text)
# To use the tm package we have to create a corpus,
# and from that a document term matrix (DTM)
text_corp <- Corpus(VectorSource(text))
text_dtm <- DocumentTermMatrix(text_corp)
inspect(text_dtm[1:2,50:60])
findFreqTerms(text_dtm, 20)
word_freq <- colSums(as.matrix(text_dtm))
word_freq[1:20]
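## word_freq comes back in term order, not frequency order; a quick
## sorted view of the top terms (sketch):
# sort(word_freq, decreasing=TRUE)[1:10]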
# Create wordcloud of top 100 most frequent words
pdf(file="Tweets_Wordcloud.pdf",width=11,height=8.5)
wordcloud(names(word_freq), word_freq, scale=c(3.2,.2),
          max.words = 100, random.order=FALSE,
          colors=c("gray", "darkgreen", "darkblue"), rot.per=.3)
dev.off()
############## FACEBOOK ##################
### FUNCTIONS
## Map a day name found in the text to a numeric code (1 = Monday ... 7 = Sunday)
get_day <- function (x) {
  if (grepl('Monday', x)) return (1)
  else if (grepl('Tuesday', x)) return (2)
  else if (grepl('Wednesday', x)) return (3)
  else if (grepl('Thursday', x)) return (4)
  else if (grepl('Friday', x)) return (5)
  else if (grepl('Saturday', x)) return (6)
  else if (grepl('Sunday', x)) return (7)
  else return (NA)
}
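## A vectorized sketch of the same lookup, assuming the page always spells
## out English day names (get_day_v is a hypothetical name):
# day_names <- c('Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday')
# get_day_v <- function(x) match(str_extract(x, paste(day_names, collapse="|")), day_names)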
## Pull the trailing time stamp out of a row (strips the closing </td>)
get_time <- function (x) {
  n <- nchar(x)
  time <- substring(x,(n-11),n)
  time <- gsub("</td>", "", time)
  return (time)
}
## Read in the html
f <- scan("./Data/Facebook/html/account_activity.html", what="", sep="\n")
f2 <- unlist(strsplit(f, '<td>', fixed = TRUE))
f2 <- f2[-1]
head(f2)
# The dates are in every 5th element
f2[c(1,6,11,16,21)]
l <- length(f2)
data <- f2[seq(1, length(f2), 5)]
which(is.na(data))
l <- length(data)
data[1:30]
## Manually assign day codes for the rows that just say 'Today' or 'Yesterday'
day1 <- rep(0,13)
## 1 = Monday
## 7 = Sunday
## The first 9 entries say 'Today', which is a Thursday
day1[1:9] <- rep(4)
day1[10:13] <- rep(3) # the next 4 say 'Yesterday' (Wednesday)
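## Hedged alternative: derive the 'Today'/'Yesterday' codes from the date
## the export was scanned instead of hard-coding them (assumes the script
## runs the same day the archive was downloaded; today_num is hypothetical):
# today_num <- ((as.POSIXlt(Sys.Date())$wday + 6) %% 7) + 1 # Monday = 1
# day1[grepl('Today', data[1:13])] <- today_num
# day1[grepl('Yesterday', data[1:13])] <- ((today_num + 5) %% 7) + 1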
data_subset <- data[14:l]
data_subset[1:10]
l_sub <- length(data_subset)
day2 <- rep(NA,l_sub)
which(is.na(data_subset))
## Get the dates
day2 <- lapply(data_subset, get_day)
day2 <- unlist(day2)
## Bind the manually created day codes and the stripped ones together
day <- c(day1,day2)
count <- table(day)
barplot(count)
## Get the time
# l is the length of the full data vector
time <- rep(NA,l)
time <- lapply(data, get_time)
time <- unlist(time)
## Convert AM/PM to a 24hr time R understands
time <- strptime(time, "%I:%M%p", tz="America/New_York")
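## Quick sanity check: strptime returns NA for any rows that
## didn't match the "%I:%M%p" format
which(is.na(time))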
## Create a data frame with the days and times aligned
ff <- data.frame(day, time)
head(ff)
## Create factors and rename the factor levels for the day of the week
ff$day2 <- factor(ff$day)
levels(ff$day2) <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
## Strip the time down to the hour
ff$hour <- hour(ff$time)
ff$hour[1:10]
### Distribution by Day
pdf(file="Facebook_Day_Time.pdf",width=11,height=8.5)
ggplot(ff, aes(hour)) + geom_bar() + facet_wrap(~ day2) +
  ggtitle("Miles' Facebook Usage by Hour and Day of The Week") +
  xlab("Hour of the Day") + ylab("Count")
dev.off()
## Create a new data frame so we can make the heatmap
daytime <- table(ff$hour, ff$day2)
dfdaytime <- as.data.frame(daytime)
pdf(file="Facebook_Heatmap.pdf",width=11,height=8.5)
ggplot(dfdaytime, aes(x=Var2, y=Var1, fill=Freq)) + geom_tile() +
  scale_fill_gradient(low="white", high="red") +
  ggtitle("Heatmap of Miles' Facebook Usage") + xlab("Day of week") + ylab("Hour of day")
dev.off()
############## OPEN PATHS ##################
library(maps)
library(geosphere)
## Read in the data
l <- read.csv("./Data/openpaths.csv", header=TRUE, as.is=TRUE)
head(l)
## Only select rows where I was actually moving (non-zero altitude recorded)
l2 <- l[which(l$alt!=0),]
head(l2)
which(l2$alt>100)
test <- l2[which(l2$alt>100),]
names(test)
which(is.na(test$lon))
## The states I want to map
states <- c('new york', 'connecticut', 'massachusetts', 'new hampshire', 'vermont')
## Map just the specific states
pdf(file="Paths_States.pdf",width=11,height=8.5)
map('county',states, col="#f2f2f2", fill=TRUE, bg="white", lwd=0.01)
## Map the country instead
#pdf(file="Paths_USA.pdf",width=11,height=8.5)
#map('state',col="#f2f2f2", fill=TRUE, bg="white", lwd=0.01)
## Draw a great-circle segment between each consecutive pair of fixes
## (loop over test, the data frame actually indexed below)
for (i in 1:(nrow(test)-1)) {
  lat_1 <- test$lat[i]
  lon_1 <- test$lon[i]
  lat_2 <- test$lat[i+1]
  lon_2 <- test$lon[i+1]
  if (lat_1 != lat_2 & lon_1 != lon_2) {
    inter <- gcIntermediate(c(lon_1, lat_1), c(lon_2, lat_2), n=10, addStartEnd=TRUE)
    lines(inter, col="red", lwd=0.6)
  }
}
#title("Miles' Movements March 2012 - May 2013")
title("Miles' North East Movements March 2012 - May 2013")
dev.off()
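## Hedged variant: plot the raw location fixes as points instead of
## great-circle arcs (same test data frame and lat/lon columns as above;
## "Paths_Points.pdf" is a hypothetical filename):
# pdf(file="Paths_Points.pdf",width=11,height=8.5)
# map('county', states, col="#f2f2f2", fill=TRUE, bg="white", lwd=0.01)
# points(test$lon, test$lat, col=rgb(1,0,0,0.3), pch=16, cex=0.4)
# title("Miles' North East Fixes March 2012 - May 2013")
# dev.off()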
#### SOURCES & HAT TIPS ###
# http://www.scienceisdelicious.net/?p=407