Skip to content

Instantly share code, notes, and snippets.

@RHDZMOTA
Last active April 1, 2016 04:29
Show Gist options
  • Save RHDZMOTA/175cdb06efb06ba0760b62a9a00fba6d to your computer and use it in GitHub Desktop.
Save RHDZMOTA/175cdb06efb06ba0760b62a9a00fba6d to your computer and use it in GitHub Desktop.
This code generates visualizations of a WhatsApp log analysis. Two things are needed in the working directory: the WhatsApp log file (.txt, exported without media) and functions_whatsapp.R
# Whatsapp's data visualization
# by: Rodrigo Hernández Mota
# file name: dataviz_whatsapp.R
# V.1.0.0
# General structure
# This file was organized as the data exploration progressed.
# For that reason, its main sections and visualizations
# are divided into:
# - word frequency
# - usual words
# - unusual words
# - word length (underdeveloped)
# - messaging per day
# - word distribution in a day (underdeveloped)
library(ggplot2)
source("functions_whatsapp.R")
# Exported chat logs that can be analysed; the fourth entry is loaded below.
available_data <- c(
  "160322_chat_mufasa.txt",
  "160322_atrh.txt",
  "160324_chat_familia.txt",
  "160325_chat_ds.txt"
)
# clean_data() is defined in the sourced helper file. The rest of this script
# reads the $observations, $word_count, $words_users, $words_hour and
# $freq_hour elements of its result.
dataset <- clean_data(available_data[4])

# choose user (for a future viz)
unique_us <- unique(dataset$observations$Users)
# Base data.frame() replaces the deprecated data_frame(), which also required
# a package that this script never attaches itself.
aux_df <- data.frame(
  User = unique_us[order(unique_us)],
  stringsAsFactors = FALSE
)
aux_df
select_user <- aux_df$User[2]
# Pull the remaining users from the column itself: calling as.character() on
# a data-frame subset deparses the whole column into a single string instead
# of returning one name per user. %in% also avoids NA results if select_user
# is NA.
order_users <- c(
  as.character(aux_df$User[!aux_df$User %in% select_user]),
  as.character(select_user)
)
# Word frequency ----------------------------------------------------------

# Word frequency for the first 50%
# Keeps the rows whose Acum is below 50 (presumably the cumulative relative
# frequency in percent -- confirm in clean_data()).
select <- dataset$word_count[dataset$word_count$Acum < 50, ]
# seq_len() gives an empty rank axis when nothing is selected; 1:n would
# produce c(1, 0) for n == 0 and no longer match the number of rows.
ggplot(select, aes(y = Rel, x = seq_len(nrow(select)))) +
  theme_light() +
  geom_bar(stat = "identity",
           alpha = 0.3, fill = "blue", color = "black") +
  ggtitle("Word frequency ranking (freq.acum 50%)") +
  xlab("Ranking") +
  ylab("Frequency (%)")

# Word frequency for the first 25%
select <- dataset$word_count[dataset$word_count$Acum < 25, ]
# Passing the words as limits pins the bar order to the row order of `select`.
ggplot(select, aes(y = Rel, x = Words)) +
  theme_light() +
  scale_x_discrete(limits = select$Words) +
  geom_bar(stat = "identity",
           alpha = 0.3, fill = "dark red", color = "black") +
  ggtitle("Word frequency ranking (freq.acum 25%)") +
  xlab("Ranking") +
  ylab("Frequency (%)")
# Usual Words -------------------------------------------------------------

# filter the common words (word_filter() comes from the sourced helper file)
select <- word_filter(dataset$word_count)
# head() returns at most 25 rows. Indexing with 1:25 would pad the data with
# all-NA rows whenever fewer than 25 words survive the filter.
top_words <- head(select, 25)
ggplot(top_words) +
  geom_bar(stat = "identity", aes(y = Rel, x = Words),
           alpha = 0.7, fill = "dark red", color = "black") +
  # Limits pin the bar order to the row order of the filtered table.
  scale_x_discrete(limits = top_words$Words) +
  theme_light() +
  ggtitle("Most used 25 words (filtered)") +
  xlab(" Word") +
  ylab("Frequency (%)")
# Unusual Words -----------------------------------------------------------

# Word frequency in the tail of the cumulative distribution.
# NOTE(review): the filter keeps Acum > 99.9 while the title mentions 1% --
# confirm which threshold is intended.
select <- dataset$word_count[dataset$word_count$Acum > 99.9, ]
ggplot(select, aes(x = Words, y = Rel)) +
  geom_bar(
    stat = "identity",
    alpha = 0.7, fill = "dark gray", colour = "black"
  ) +
  # Limits pin the bar order to the row order of `select`.
  scale_x_discrete(limits = select$Words) +
  theme_light() +
  labs(
    title = "Less used words (1% of acummulated use)",
    x = " Palabra",
    y = "Frecuencia Relativa (%)"
  )
# Unique words - 1 single use
select <- dataset$word_count[dataset$word_count$Count == 1, ]
select <- as.data.frame(select)
tot_unw <- nrow(select)
# sample.int() draws distinct row numbers uniformly from 1..tot_unw. The old
# floor(runif(20) * (tot_unw - 1)) + 1 could never reach the last row and
# could return the same word more than once. min() covers chats with fewer
# than 20 single-use words.
rand_index <- sample.int(tot_unw, min(20, tot_unw))
# as.character() keeps substr()/nchar() working if Words is stored as a factor.
rand_unique_words <- as.character(select[rand_index, "Words"])
# First letter and length are taken directly with vectorised string helpers
# instead of splitting every word into characters and walking the pieces.
select <- data.frame(
  Words = rand_unique_words,
  Beginning = substr(rand_unique_words, 1, 1),
  Length = nchar(rand_unique_words),
  stringsAsFactors = FALSE
)
ggplot(select, aes(Beginning, Length, label = Words)) +
  geom_point() +
  # Labels start slightly to the right of their point.
  geom_text(hjust = 0, size = 5,
            nudge_x = 0.05, color = "dark blue") +
  theme_bw() +
  theme(
    legend.background = element_rect(fill = "white"),
    panel.grid.major = element_line(colour = "grey"),
    panel.grid.minor = element_blank()
  ) +
  #geom_label(fontface = "bold") +
  ggtitle("20 random words used just one time") +
  xlab("Initial letter") +
  ylab("Word's length")
# Word length -------------------------------------------------------------

# not meaningful
# Scatter of word length against count; point size is mapped to id_nl and the
# size legend is hidden. The annotation is placed at 3/4 of the largest length
# and 80% of the largest count.
ggplot(dataset$word_count, aes(x = Word_lenght, y = Count)) +
  geom_point(aes(size = id_nl), alpha = 0.3, colour = "dark red") +
  annotate(
    "text",
    x = 3 * max(dataset$word_count$Word_lenght) / 4,
    y = max(dataset$word_count$Count) * 0.8,
    label = "El área de los puntos es proporcional
al número de palabras con la misma
longitud y frecuencia",
    size = 3
  ) +
  xlim(0, 15) +
  theme_light() +
  theme(legend.position = "none") +
  labs(title = "Frequency and length", x = "Length", y = "Frequency")

# kind of meaningful
# Density estimate of word length, restricted to lengths between 0 and 15.
ggplot(dataset$word_count) +
  geom_density(
    aes(x = Word_lenght, y = ..density..),
    fill = "blue", alpha = 0.3
  ) +
  xlim(0, 15) +
  theme_light() +
  labs(
    title = "Estimated density",
    x = "Length of the word",
    y = "Density"
  )
# Messaging per day -------------------------------------------------------

# Number of words messaged by day: one point per row of words_users,
# coloured by user, with a single-column legend.
ggplot(dataset$words_users) +
  geom_point(aes(x = Date, y = Words, colour = User)) +
  guides(colour = guide_legend(ncol = 1)) +
  theme_light() +
  labs(
    title = "Number of words send by user",
    x = "Day",
    y = "Number"
  )
# Distribution of number of words send per day
unique_us <- unique(dataset$observations$Users)
if (length(unique_us) > 2) {
  # Group chat: outline every other user's density and highlight the selected
  # user with a filled density.
  ggplot(NULL) +
    geom_density(
      data = dataset$words_users[dataset$words_users$User != select_user, ],
      aes(x = Words, y = ..density.., colour = User)
    ) +
    # The two-column layout belongs to the colour legend, which lists the
    # other users. This call previously targeted `fill` and merely duplicated
    # the guides() call further down, leaving the colour legend unconfigured.
    guides(colour = guide_legend(ncol = 2)) +
    geom_density(
      data = dataset$words_users[dataset$words_users$User == select_user, ],
      aes(x = Words, y = ..density.., fill = User),
      alpha = 0.35
    ) +
    scale_fill_manual(name = "Selected", values = "blue") +
    guides(fill = guide_legend(ncol = 2)) +
    theme_light() +
    # The x axis is limited to a quarter of the largest daily word count.
    xlim(0, max(dataset$words_users$Words) / 4) +
    ggtitle("Number of words send by day") +
    xlab("Quantity") +
    ylab("Density")
} else {
  # One or two users: overlay one filled density per user.
  ggplot(dataset$words_users, aes(x = Words, y = ..density.., fill = User)) +
    geom_density(alpha = 0.2) +
    theme_light() +
    xlim(0, max(dataset$words_users$Words) / 4) +
    ggtitle("Number of words send by day") +
    xlab("Quantity") +
    ylab("Density") +
    guides(fill = guide_legend(ncol = 1))
}
# Daily word-count density, one panel per user with independent axes.
ggplot(dataset$words_users) +
  geom_density(
    aes(x = Words, y = ..density..),
    alpha = 0.1, fill = "gray"
  ) +
  facet_wrap(~ User, scales = "free") +
  theme_light() +
  labs(title = "Words send by day", x = "Quantity", y = "Density")
# Number of words distributed in a day ------------------------------------
# Still under development.

# Density of the Hour column of words_hour across all users.
ggplot(dataset$words_hour) +
  geom_density(
    aes(x = Hour, y = ..density..),
    alpha = 0.3, fill = "blue"
  ) +
  theme_light() +
  labs(
    title = "Words send according to the hour",
    x = "Time of the day",
    y = "Density"
  )
if (length(unique_us) > 2) {
  # choose user
  # Group chat: outline every other user's hourly density and highlight the
  # selected user with a filled density.
  ggplot(NULL) +
    geom_density(
      data = dataset$words_hour[dataset$words_hour$Users != select_user, ],
      aes(x = Hour, y = ..density.., colour = Users)
    ) +
    # The two-column layout belongs to the colour legend, which lists the
    # other users. This call previously targeted `fill` and merely duplicated
    # the guides() call further down, leaving the colour legend unconfigured.
    guides(colour = guide_legend(ncol = 2)) +
    geom_density(
      data = dataset$words_hour[dataset$words_hour$Users == select_user, ],
      aes(x = Hour, y = ..density.., fill = Users),
      alpha = 0.35
    ) +
    scale_fill_manual(name = "Selected", values = "blue") +
    guides(fill = guide_legend(ncol = 2)) +
    theme_light() +
    #xlim(0, x_lim[2]) +
    ggtitle("Función de densidad: frecuencia de palabras enviadas por usuario según la hora") +
    xlab("Hora del día") +
    ylab("Frecuencia relativa")
} else {
  # One or two users: overlay one filled hourly density per user.
  ggplot(dataset$words_hour, aes(x = Hour, y = ..density.., fill = Users)) +
    geom_density(alpha = 0.1) +
    theme_light() +
    ggtitle("Función de densidad: frecuencia de palabras enviadas según la hora por usuario") +
    xlab("Hora del día") +
    ylab("Frecuencia relativa")
}
# Polar-coordinate experiments on freq_hour (Words per Hours, split by User).

# Full-width stacked bars with black outlines.
ggplot(dataset$freq_hour) +
  geom_bar(
    aes(x = Hours, y = Words, fill = User),
    stat = "identity", width = 1, colour = "black"
  ) + #scale_fill_brewer() +
  coord_polar() +
  theme_minimal()

# Points instead of bars, coloured by user.
ggplot(dataset$freq_hour) +
  geom_point(aes(x = Hours, y = Words, colour = User)) + #scale_fill_brewer() +
  coord_polar() +
  theme_minimal()

# All hours collapsed onto a single x position.
ggplot(dataset$freq_hour) +
  geom_bar(aes(x = factor(1), y = Words, fill = User), stat = "identity") +
  coord_polar()

# Bars normalised to the same height per hour (position "fill").
ggplot(dataset$freq_hour) +
  geom_bar(
    aes(x = Hours, y = Words, fill = User),
    stat = "identity", position = "fill"
  ) + #scale_fill_brewer() +
  coord_polar() +
  theme_minimal()

# Explicitly stacked bars.
ggplot(dataset$freq_hour) +
  geom_bar(
    aes(x = Hours, y = Words, fill = User),
    stat = "identity", position = "stack"
  ) + #scale_fill_brewer() +
  coord_polar() +
  theme_minimal()

# Bars placed side by side per user.
ggplot(dataset$freq_hour) +
  geom_bar(
    aes(x = Hours, y = Words, fill = User),
    stat = "identity", position = "dodge"
  ) +
  coord_polar() +
  theme_minimal()
# Hourly density per user drawn in polar coordinates.
ggplot(dataset$words_hour) +
  geom_density(
    aes(x = Hour, y = ..density.., fill = Users),
    alpha = 0.1
  ) +
  coord_polar()

# Hourly density, one panel per user.
ggplot(dataset$words_hour) +
  geom_density(
    aes(x = Hour, y = ..density..),
    alpha = 0.1, fill = "gray"
  ) +
  facet_wrap(~ Users) +
  theme_light() +
  labs(title = "Prob", x = "Time of the day", y = "Density")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment