Last active
April 1, 2016 04:29
-
-
Save RHDZMOTA/175cdb06efb06ba0760b62a9a00fba6d to your computer and use it in GitHub Desktop.
This code generates visualizations from the analysis of WhatsApp chat logs. Two things are needed in the working directory: the WhatsApp log file (.txt, exported without media) and functions_whatsapp.R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Whatsapp's data visualization
# by: Rodrigo Hernández Mota
# file name: dataviz_whatsapp.R
# V.1.0.0
# General structure
# This file was structured as the data exploration process
# developed. For that reason, the main sections and the
# visualizations they contain are divided into:
# - word frequency
# - usual words
# - unusual words
# - word length (underdeveloped)
# - messaging per day
# - word distribution in a day (underdeveloped)
library(ggplot2) | |
source("functions_whatsapp.R") | |
# Chat logs that can be analysed; the chosen file must be in the working
# directory.
available_data <- c("160322_chat_mufasa.txt",
                    "160322_atrh.txt",
                    "160324_chat_familia.txt",
                    "160325_chat_ds.txt")
# clean_data() is expected to come from functions_whatsapp.R. The sections
# below read the elements observations, word_count, words_users, words_hour
# and freq_hour from its result.
dataset <- clean_data(available_data[4])

# choose user (for a future viz)
unique_us <- unique(dataset$observations$Users)
# Base data.frame: data_frame() is deprecated and is not exported by any
# package attached in this file.
aux_df <- data.frame(User = sort(unique_us, na.last = TRUE),
                     stringsAsFactors = FALSE)
aux_df
select_user <- aux_df$User[2]
# Index the column rather than the table: as.character() applied to a
# one-column tibble deparses the whole column into a single string.
# The selected user goes last.
order_users <- c(as.character(aux_df$User[aux_df$User != select_user]),
                 as.character(select_user))
# Word frequency ----------------------------------------------------------

# Word frequency for the first 50%: rows of word_count whose Acum is below 50,
# plotted against their row position.
select <- dataset$word_count[dataset$word_count$Acum < 50, ]
# seq_len() instead of 1:n, so an empty selection gives an empty ranking
# rather than c(1, 0), which would not match the number of rows.
ggplot(select, aes(y = Rel, x = seq_len(nrow(select)))) + theme_light() +
  geom_bar(stat = "identity",
           alpha = 0.3, fill = "blue", color = "black") +
  ggtitle("Word frequency ranking (freq.acum 50%)") +
  xlab("Ranking") + ylab("Frequency (%)")

# Word frequency for the first 25%: same plot, labelled with the words.
select <- dataset$word_count[dataset$word_count$Acum < 25, ]
ggplot(select, aes(y = Rel, x = Words)) + theme_light() +
  # Keep the bars in the row order of `select` instead of alphabetical order.
  scale_x_discrete(limits = select$Words) +
  geom_bar(stat = "identity",
           alpha = 0.3, fill = "dark red", color = "black") +
  ggtitle("Word frequency ranking (freq.acum 25%)") +
  xlab("Ranking") + ylab("Frequency (%)")
# Usual Words -------------------------------------------------------------

# filter the common words
# word_filter() is presumably defined in functions_whatsapp.R -- TODO confirm.
select <- word_filter(dataset$word_count)
# head() instead of select[1:25, ]: when fewer than 25 words remain, indexing
# past the last row would add all-NA rows to the plot data.
top_words <- head(select, 25)
ggplot(top_words) +
  geom_bar(stat = "identity", aes(y = Rel, x = Words),
           alpha = 0.7, fill = "dark red", color = "black") +
  # Keep the bars in the row order of the table.
  scale_x_discrete(limits = top_words$Words) +
  theme_light() +
  ggtitle("Most used 25 words (filtered)") +
  xlab(" Word") + ylab("Frequency (%)")
# Unusual Words -----------------------------------------------------------

# Least used words: rows of word_count whose Acum is above 99.9.
# NOTE(review): if Acum is a cumulative percentage, 99.9 keeps the last 0.1%,
# while the title says 1% -- confirm which one is intended.
in_tail <- dataset$word_count$Acum > 99.9
select <- dataset$word_count[in_tail, ]
ggplot(select) +
  geom_bar(aes(x = Words, y = Rel), stat = "identity",
           fill = "dark gray", color = "black", alpha = 0.7) +
  # Bars follow the row order of `select`.
  scale_x_discrete(limits = select$Words) +
  theme_light() +
  labs(title = "Less used words (1% of acummulated use)",
       x = " Palabra",
       y = "Frecuencia Relativa (%)")
# Unique words - 1 single use
select <- dataset$word_count[dataset$word_count$Count == 1, ]
select <- as.data.frame(select)
tot_unw <- nrow(select)
# Draw up to 20 distinct rows. The former floor(runif(20) * (tot_unw - 1)) + 1
# could never pick the last row, could pick the same word several times, and
# evaluated to index 0 when no word was used exactly once.
rand_index <- sample.int(tot_unw, size = min(20L, tot_unw))
rand_unique_words <- as.character(select[rand_index, "Words"])
# First letter and length are taken directly from the strings; base
# data.frame avoids the deprecated data_frame().
select <- data.frame(Words = rand_unique_words,
                     Beginning = substr(rand_unique_words, 1, 1),
                     Length = nchar(rand_unique_words),
                     stringsAsFactors = FALSE)
ggplot(select, aes(Beginning, Length, label = Words)) +
  geom_point() +
  geom_text(hjust = 0, size = 5,
            nudge_x = 0.05, color = "dark blue") +
  theme_bw() +
  theme(legend.background = element_rect(fill = "white"),
        panel.grid.major = element_line(colour = "grey"),
        panel.grid.minor = element_blank()) +
  #geom_label(fontface = "bold") +
  # The count in the title follows the number of words actually drawn.
  ggtitle(paste(nrow(select), "random words used just one time")) +
  xlab("Initial letter") + ylab("Word's length")
# Word length -------------------------------------------------------------

# not meaningful
# Count against Word_lenght for every row of word_count; point size is mapped
# to id_nl.
# Right edge of the x axis, shared by xlim() and the annotation placement.
max_len_shown <- 15
ggplot(dataset$word_count, aes(Word_lenght, Count)) +
  geom_point(aes(size = id_nl), alpha = 0.3, color = "dark red") +
  theme_light() + xlim(0, max_len_shown) +
  theme(legend.position = "none") +
  ggtitle("Frequency and length") +
  xlab("Length") + ylab("Frequency") +
  # Clamp x to the visible range: with words longer than 20 characters the
  # former 3/4 * max(Word_lenght) fell beyond xlim() and the note was dropped.
  annotate("text",
           x = 3 * min(max(dataset$word_count$Word_lenght), max_len_shown) / 4,
           y = max(dataset$word_count$Count) * 0.8,
           label = "El área de los puntos es proporcional
al número de palabras con la misma
longitud y frecuencia", size = 3)
# kind of meaningful
# Density estimate of the Word_lenght column, x axis limited to 0-15.
ggplot(dataset$word_count, aes(x = Word_lenght, y = ..density..)) +
  theme_light() +
  xlim(0, 15) +
  geom_density(alpha = 0.3, fill = "blue") +
  labs(title = "Estimated density",
       x = "Length of the word",
       y = "Density")
# Messaging per day -------------------------------------------------------

# Words per date, one point per row of words_users, coloured by user.
ggplot(dataset$words_users) +
  geom_point(aes(x = Date, y = Words, color = User)) +
  # Single-column legend.
  guides(color = guide_legend(ncol = 1)) +
  theme_light() +
  labs(title = "Number of words send by user",
       x = "Day",
       y = "Number")
# Distribution of number of words send per day
unique_us <- unique(dataset$observations$Users)
# Both variants cut the x axis at a quarter of the largest Words value.
words_limit <- max(dataset$words_users$Words) / 4
if (length(unique_us) > 2) {
  # More than two users: outline every other user and fill only the
  # selected one.
  others <- dataset$words_users[dataset$words_users$User != select_user, ]
  chosen <- dataset$words_users[dataset$words_users$User == select_user, ]
  ggplot(NULL) +
    geom_density(data = others,
                 aes(x = Words, y = ..density.., colour = User)) +
    geom_density(data = chosen,
                 aes(x = Words, y = ..density.., fill = User),
                 alpha = 0.35) +
    scale_fill_manual(name = "Selected", values = "blue") +
    # NOTE(review): only the fill legend gets a column setting; confirm
    # whether the colour legend was meant to get one too.
    guides(fill = guide_legend(ncol = 2)) +
    theme_light() +
    xlim(0, words_limit) +
    labs(title = "Number of words send by day",
         x = "Quantity",
         y = "Density")
} else {
  # Two users or fewer: one filled density per user.
  ggplot(dataset$words_users,
         aes(x = Words, y = ..density.., fill = User)) +
    geom_density(alpha = 0.2) +
    guides(fill = guide_legend(ncol = 1)) +
    theme_light() +
    xlim(0, words_limit) +
    labs(title = "Number of words send by day",
         x = "Quantity",
         y = "Density")
}
# Density of words per day, one panel per user with independent axes.
ggplot(dataset$words_users, aes(x = Words, y = ..density..)) +
  geom_density(alpha = 0.1, fill = "gray") +
  # Spell out `scales`; `scale =` only worked through partial argument
  # matching.
  facet_wrap(~ User, scales = "free") +
  theme_light() +
  ggtitle("Words send by day") +
  xlab("Quantity") +
  ylab("Density")
# Number of words distributed in a day ------------------------------------

# Still under development.
# Density of the Hour column of words_hour, all users together.
ggplot(dataset$words_hour) +
  geom_density(aes(x = Hour, y = ..density..),
               fill = "blue", alpha = 0.3) +
  theme_light() +
  labs(title = "Words send according to the hour",
       x = "Time of the day",
       y = "Density")
if (length(unique_us) > 2) {
  # More than two users: outline every other user and fill only the
  # selected one.
  hours_others <- dataset$words_hour[dataset$words_hour$Users != select_user, ]
  hours_chosen <- dataset$words_hour[dataset$words_hour$Users == select_user, ]
  ggplot(NULL) +
    geom_density(data = hours_others,
                 aes(x = Hour, y = ..density.., colour = Users)) +
    geom_density(data = hours_chosen,
                 aes(x = Hour, y = ..density.., fill = Users),
                 alpha = 0.35) +
    scale_fill_manual(name = "Selected", values = "blue") +
    guides(fill = guide_legend(ncol = 2)) +
    theme_light() +
    labs(title = "Función de densidad: frecuencia de palabras enviadas por usuario según la hora",
         x = "Hora del día",
         y = "Frecuencia relativa")
} else {
  # Two users or fewer: one filled density per user.
  ggplot(dataset$words_hour,
         aes(x = Hour, y = ..density.., fill = Users)) +
    geom_density(alpha = 0.1) +
    theme_light() +
    labs(title = "Función de densidad: frecuencia de palabras enviadas según la hora por usuario",
         x = "Hora del día",
         y = "Frecuencia relativa")
}
# Polar views of freq_hour (Words per Hours, by User). The bar variants share
# one base plot and differ only in the bar layer.
polar_fill_base <- ggplot(dataset$freq_hour,
                          aes(x = Hours, y = Words, fill = User)) +
  theme_minimal() +
  coord_polar()

# Full-width bars with a black outline.
polar_fill_base +
  geom_bar(stat = "identity", width = 1, color = "black")

# Points, coloured by user.
ggplot(dataset$freq_hour, aes(x = Hours, y = Words, color = User)) +
  geom_point() +
  theme_minimal() +
  coord_polar()

# All hours collapsed into a single x position (default theme).
ggplot(dataset$freq_hour, aes(x = factor(1), y = Words, fill = User)) +
  coord_polar() +
  geom_bar(stat = "identity")

# Bar position "fill".
polar_fill_base +
  geom_bar(stat = "identity", position = "fill")

# Bar position "stack".
polar_fill_base +
  geom_bar(stat = "identity", position = "stack")

# Bar position "dodge".
polar_fill_base +
  geom_bar(stat = "identity", position = "dodge")
# Density of Hour per user, drawn in polar coordinates.
ggplot(dataset$words_hour) +
  geom_density(aes(x = Hour, y = ..density.., fill = Users), alpha = 0.1) +
  coord_polar()

# Density of Hour, one panel per user.
ggplot(dataset$words_hour) +
  geom_density(aes(x = Hour, y = ..density..),
               fill = "gray", alpha = 0.1) +
  facet_wrap(~ Users) +
  theme_light() +
  labs(title = "Prob",
       x = "Time of the day",
       y = "Density")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment