Functions used by whatsapp_dataviz.R. Mainly data cleaning.
# WhatsApp data visualization and data cleaning functions
# by: Rodrigo Hernández Mota
# file name: functions_whatsapp.R
# V.1.0.0
library(tibble)
library(tidyr)
# palette_colors ----------------------------------------------------------
# This function returns a vector of colors (character HEX codes).
# Variables:
#   x: character specifying the type of palette.
#   y: numeric of length 1 indicating the desired length of the vector
#      of colors.
#   select: position of the highlighted color when x == "usuario".
palette_colors <- function(x, y = 3, select = 1){
  # check the variable type
  if (!is.character(x))
    stop('x parameter should be a character.')
  # available palettes
  if(x == "facebook")
    pal <- c("#3b5998", "#8b9dc3", "#dfe3ee")
  if(x == "google")
    pal <- c("#008744", "#0057e7", "#d62d20", "#ffa700")
  if(x == "griff")
    pal <- c("#740001", "#ae0001", "#eeba30", "#d3a625", "#000000")
  if(x == "pastel")
    pal <- c("#1b85b8", "#5a5255", "#559e83", "#ae5a41", "#c3cb71")
  if(x == "usuario"){
    pal <- rep("darkgray", y); pal[select] <- "darkred" }
  if(x == "todos")
    pal <- c("#3b5998", "#d62d20", "#008744", "#0057e7", "#ffa700",
             "#740001", "#ae0001", "#eeba30", "#d3a625", "#8b9dc3",
             "#dfe3ee")
  # check the length
  if(length(pal) < y)
    stop('y is greater than the length of the selected palette.')
  return(pal[1:y])
}
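# Usage example (illustrative only):
# palette_colors("google", y = 4)               # "#008744" "#0057e7" "#d62d20" "#ffa700"
# palette_colors("usuario", y = 3, select = 2)  # "darkgray" "darkred" "darkgray"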
# count_length ------------------------------------------------------------
# This function counts the number of letters in a word.
# Variables:
#   word: character, i.e. a word.
count_length <- function(word){
  # check the variable type
  if (!is.character(word))
    stop('word parameter should be a character.')
  else
    length(unlist(strsplit(word, "")))
}
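# Usage example (illustrative only):
# count_length("hola")  # 4 (equivalent to nchar("hola") for a single word)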
# nl ---------------------------------------------------------------------
# Auxiliary function. Pastes together the elements of a vector of length 2.
# Variables:
#   vect: vector of length 2
nl <- function(vect){
  if (is.vector(vect))
    if (length(vect) == 2)
      paste(vect[1], vect[2], sep = "")
}
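# Usage example (illustrative only):
# nl(c("2023-02-13", " 10:12 "))  # "2023-02-13 10:12 "
# Vectors of any other length return NULL.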
# other_stuff -------------------------------------------------------------
# Given a logical index that marks variable 1 in a vector, obtain
# variable 2 as "everything that is not variable 1", gluing together
# consecutive pieces that belong to the same element.
# Variables:
#   raw_data: character vector with the raw (split) pieces.
#   var1_index: logical vector marking the positions of variable 1.
other_stuff <- function(raw_data, var1_index){
  other_index <- c(1:length(raw_data)) * (!var1_index)
  other_index <- other_index[other_index != 0]
  other_value <- numeric()
  other_value[1] <- raw_data[other_index[1]]
  j <- other_index[1]; k <- 1; tx <- ""
  for(i in other_index[2:length(other_index)]){
    if(i - j == 1)
      other_value[k] <- paste(other_value[k], raw_data[i])
    else{
      k <- k + 1
      other_value[k] <- raw_data[i]
    }
    j <- i
  }
  return(other_value)
}
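# Usage example (illustrative only): drop the "date" piece and glue the
# adjacent leftovers back into a single message.
# other_stuff(c("13/2/23", "10:12 - Ana: hola", "que tal"),
#             c(TRUE, FALSE, FALSE))
# returns "10:12 - Ana: hola que tal"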
# clean_data --------------------------------------------------------------
# Function for cleaning the data of a WhatsApp .txt export.
# Variables:
#   dataset_txt: path to the WhatsApp .txt export file.
#   specif: option passed to specific_func() to fix user names
#           (FALSE to skip it, or 1 / 2 for the specific cases handled there).
clean_data <- function(dataset_txt, specif = F){
  # read whatsapp .txt data
  raw_data <- read.table(dataset_txt, header = F, sep = "\t",
                         stringsAsFactors = F, quote = "")
  raw_data <- apply(raw_data, 1, as.character)
  raw_data <- unlist(lapply(raw_data, strsplit, split = "\n"))
  # So now we have a character vector with one element per message.
  # general structure:   <date>, <time> - <name>: <text>
  # possible variation:  <date>, <time> - <name> <action>
  # possible variation:  <text>   (comes from the previous message)
  # Separate by comma as an attempt to take out the date.
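  # e.g. a (hypothetical) exported line looks like:
  #   "13/2/23, 10:12 - Ana: hola, nos vemos mañana"
  # note the extra comma inside the text; that is why the leftover pieces
  # are glued back together with other_stuff() below.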
  raw_data <- lapply(raw_data, strsplit, split = ",")
  # unlist to work with a vector.
  raw_data <- unlist(raw_data)
  # identify the positions of the dates
  date_index <- sapply(X = raw_data,
                       FUN = function(y){
                         z <- sum(
                           c(
                             is(
                               tryCatch(as.Date(y), error = function(e) e),
                               "error"
                             ),
                             is.na(y)
                           )
                         ) == 0
                         return(z)
                       })
  # Get the values of the dates and of everything else in separate vectors.
  # Because the text of the messages was also split by comma, the leftover
  # pieces have to be glued back together with a for loop
  # (this is all done in other_stuff()).
  date_value <- raw_data[date_index]
  other <- other_stuff(raw_data, date_index)
  date <- as.Date(date_value)
  # Check-up
  if(length(date) != length(other))
    stop("Problem between _dates_ and _other_: see function")
  aux_df <- data_frame(Date = date, Other = other)
  unique_days <- unique(aux_df$Date)
  # If the dataset is too large the code becomes very slow, so for
  # large datasets a random sample of the data is taken.
  # Samples are taken by day.
  n <- length(unique_days)
  if(n > 120 | sqrt(nrow(aux_df)) > 120){
    sample_size <- floor(sqrt(n) + n / 10)
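    # e.g. (illustrative) with n = 400 unique days this keeps
    # floor(sqrt(400) + 400 / 10) = floor(20 + 40) = 60 sampled days.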
    sample_index <- floor((n - 1) * runif(sample_size) + 1)
    sample_dates <- unique_days[sample_index]
    sample_data <- data_frame()
    for(i in sample_dates){
      sample_data <- rbind(sample_data, aux_df[aux_df$Date == i, ])
    }
    date <- sample_data$Date
    other <- as.character(sample_data$Other)
  }
  # By this point the vector "other" might look like
  # general structure:   <time> - <name>: <text>
  # possible variation:  <time> - <name> <action>
  # Separate by dash as an attempt to take out the time.
  otherl <- unlist(lapply(other, strsplit, split = "-"))
  time_index <- sapply(otherl, function(x){
    a <- strsplit(gsub(" ", "", x), ":")[[1]]
    if(length(a) == 2){
      sum(is.na(suppressWarnings(as.numeric(a)))) == 0
    } else { FALSE }})
  time_value <- otherl[time_index]
  # Check-up
  if(length(date) != length(time_value))
    stop("Problem between _dates_ and _time_: see function")
  # Join time and date and create a variable with the "other" data.
  general_time <- apply(cbind(as.character(date), time_value), 1, nl)
  time <- strptime(general_time, paste("%Y-%m-%d", " %H:%M ", sep = ""))
  other <- other_stuff(otherl, time_index)
  # Check-up
  if(length(time) != length(other))
    stop("Problem between _time_ and _other_: see function")
  # Now the vector "other" might look like
  # general structure:   <name>: <text>
  # possible variation:  <name> <action>
  # Separate by ":" as an attempt to take out the user.
  otherl <- lapply(other, strsplit, split = ":")
  # TO DO: re-write this code avoiding for loops.
  other <- numeric()
  user <- numeric()
  for(i in 1:length(otherl)){
    n <- length(otherl[[i]][[1]])
    user[i] <- gsub(" ", "", otherl[[i]][[1]][1])
    for(j in 2:n){
      if(j == 2)
        tex <- otherl[[i]][[1]][2]
      else
        tex <- paste(tex, otherl[[i]][[1]][j], sep = ":")
    }
    other[i] <- tex
  }
  # Check-up
  if(length(time) != length(other))
    stop("Problem between _time_ and _other_ (name): see function")
  # We now have the following variables:
  #   date, time, user, other.
  # The "other" variable represents the message sent.
  # Separate by " " as an attempt at word identification.
  # Challenges: empty spaces, punctuation symbols, NAs.
  otherl <- lapply(other, strsplit, split = " ")
  # TO DO: re-write this code avoiding for loops.
  m <- 1
  n_max <- 0
  words <- list()
  pos <- numeric()
  for(i in 1:length(otherl)){
    n <- length(otherl[[i]][[1]])
    w <- numeric()
    k <- 1
    for(j in 1:length(otherl[[i]][[1]])){
      if(otherl[[i]][[1]][j] != "" & otherl[[i]][[1]][j] != " "){
        w[k] <- gsub(pattern = '[[:punct:]]',
                     replacement = "",
                     otherl[[i]][[1]][j])
        k <- k + 1
      }
    }
    l <- 1
    wf <- numeric()
    for(j in 1:length(w)){
      if(length(w) != 0){
        if(w[j] != ""){
          wf[l] <- w[j]
          l <- l + 1
        }
      }
    }
    if(l != 1){
      pos[m] <- i
      words[[m]] <- wf
      n_act <- length(words[[m]])
      if(n_act > n_max)
        n_max <- n_act
      m <- m + 1
    }
  }
  # TO DO: re-write this code avoiding for loops.
  by_words <- numeric()
  gen_word <- numeric()
  for(i in 1:length(words)){
    n <- length(words[[i]])
    v <- c(words[[i]], rep(NA, n_max + 1 - n))
    gen_word <- c(gen_word, words[[i]])
    if(i == 1)
      by_words <- v
    else
      by_words <- rbind(by_words, v)
  }
  # Now we create a data frame with the words of each message sent
  names(by_words) <- as.character(1:ncol(by_words))
  rownames(by_words) <- as.character(1:nrow(by_words))
  by_words <- as_data_frame(by_words)
  # here we unify all the variables into a single data_frame
  obs <- data_frame(Date = date, Time = as.POSIXct(time), Users = user)
  obs <- obs[pos, ]
  obs <- as_data_frame(cbind(obs, by_words))
  # determine the frequency of each word
  unique_words <- unique(gen_word)
  cont_words <- numeric()
  aux_df <- data_frame(Words = gen_word)
  for(i in 1:length(unique_words)){
    cont_words[i] <- dim(aux_df[aux_df$Words == unique_words[i], ])[1]
  }
  word_count <- data_frame(Words = unique_words,
                           Count = as.numeric(cont_words))
  # Change the name of "media omitted" to <media>
  # Add relative and cumulative frequency
  new_row <- c("<media>", as.character(
    word_count[word_count$Words == "omitted", "Count"]))
  word_count <- rbind(word_count, new_row)
  word_count$Count <- as.numeric(word_count$Count)
  word_count <- word_count[order(word_count$Count, decreasing = T), ]
  word_count <- word_count[word_count$Words != "Media" & word_count$Words != "omitted", ]
  word_count$Rel <- 100 * word_count$Count / sum(word_count$Count)
  Acum <- numeric()
  Acum[1] <- as.numeric(word_count$Rel[1])
  for(i in 2:length(word_count$Rel)){
    Acum[i] <- Acum[i-1] + word_count$Rel[i]
  }
  word_count$Acum <- Acum
  # Fix some bugs in the user names: drop system messages such as
  # "X added Y", "X changed this ...", "X left", "X created group".
  unique_us <- as.character(unique(obs$Users))
  # Start the clock
  # ptm <- proc.time()
  aux_df <- data_frame()
  for(i in unique_us){
    filr <- as.character(i)
    filr <- paste(filr, "txt_filtr", sep = "")
    filr <- strsplit(filr, "added")
    if(length(filr[[1]]) == 1)
      filr <- strsplit(filr[[1]], "changedthis")
    if(length(filr[[1]]) == 1)
      filr <- strsplit(filr[[1]], "left")
    if(length(filr[[1]]) == 1)
      filr <- strsplit(filr[[1]], "-NA")
    if(length(filr[[1]]) == 1)
      filr <- strsplit(filr[[1]], "createdg")
    if(length(filr[[1]]) == 1){
      aux_df <- rbind(aux_df,
                      obs[obs$Users == i, ])
    }
  }
  obs <- aux_df
  unique_us <- as.character(unique(obs$Users))
  # Stop the clock
  # proc.time() - ptm
  # remove NAs
  avoid_index <- is.na(obs$Date) == F
  obs <- obs[avoid_index, ]
  # apply the optional user-name fix
  obs <- specific_func(specif, unique_us, obs)
  # Determine the number of words and messages per day
  k <- 1
  dates <- unique(obs$Date)
  messages_users <- data_frame()
  words_users <- data_frame()
  for(i in dates){
    for(j in unique_us){
      messages_users[k, j] <- dim(obs[obs$Users == j & obs$Date == i, "Users"])[1]
      words_users[k, j] <- sum(is.na(obs[obs$Users == j & obs$Date == i, ]) == F) - 3
    }
    k <- k + 1
  }
  messages_users$Date <- dates
  messages_users <- gather(messages_users, User, Messages, -Date)
  words_users$Date <- dates
  words_users <- gather(words_users, User, Words, -Date)
  # Words and word length
  word_count$Word_lenght <- sapply(word_count$Words, count_length)
  word_count$id_nl <- as.factor(apply(
    word_count[, c("Count", "Word_lenght")], 1, nl))
  order_id <- word_count[order(word_count$Count), "id_nl"]
  levels(word_count$id_nl) <- order_id
  # Words and hour
  tidy_obs <- na.omit(gather(obs, place_on, Word,
                             -Date, -Time, -Users))
  tidy_obs <- na.omit(tidy_obs)
  hours_vector <- sapply(tidy_obs$Time, FUN = function(x)
    as.numeric(format(strptime(x, "%Y-%m-%d %H:%M:%S"), "%H")))
  tidy_obs$Hour <- hours_vector
  words_hour <- tidy_obs[, c("Time", "Hour", "Users", "Word")]
  k <- 1
  time <- obs$Time
  hours <- sapply(time, FUN = function(x)
    as.numeric(format(strptime(x, "%Y-%m-%d %H:%M:%S"), "%H")))
  aux_df <- obs
  aux_df$Hours <- hours
  hours_un <- unique(hours)
  freq_hour <- data_frame()
  for(i in hours_un){
    for(j in unique_us){
      freq_hour[k, j] <- sum(is.na(aux_df[aux_df$Users == j & aux_df$Hours == i, ]) == F) - 4
      if(freq_hour[k, j] < 0)
        freq_hour[k, j] <- 0
    }
    k <- k + 1
  }
  freq_hour$Hours <- hours_un
  freq_hour <- gather(freq_hour, User, Words, -Hours)
  # We have the following data_frames:
  #   obs:            observations
  #   tidy_obs:       like obs but tidy
  #   word_count:     word count
  #   messages_users: number of messages per user
  #   words_users:    number of words per user
  #   words_hour:     words said per hour
  #   freq_hour:      use of words per hour
  clean_dataset <- list()
  clean_dataset[[1]] <- obs
  clean_dataset[[2]] <- tidy_obs
  clean_dataset[[3]] <- word_count
  clean_dataset[[4]] <- messages_users
  clean_dataset[[5]] <- words_users
  clean_dataset[[6]] <- words_hour
  clean_dataset[[7]] <- freq_hour
  names(clean_dataset) <- c("observations", "tidy_obs", "word_count",
                            "messages_users", "words_users",
                            "words_hour", "freq_hour")
  return(clean_dataset)
}
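# Usage example (illustrative only; assumes a WhatsApp export named
# "chat_export.txt" in the working directory):
# chat <- clean_data("chat_export.txt")
# head(chat$word_count)       # most frequent words
# head(chat$messages_users)   # messages per user and day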
# specific_func -----------------------------------------------------------
# Specific function, used only for a particular job: rename the users of
# two specific chats (specif = 1 or 2) to readable names.
specific_func <- function(specif, unique_us, obs){
  if(specif){
    if(specif == 2){
      if(unique_us[1] == "Radrigo"){
        new_users <- c("Rodrigo", "Ana_Teresa")
      }else{
        new_users <- c("Ana_Teresa", "Rodrigo")
      }
    }
    if(specif == 1){
      unique_us <- unique(obs$Users)
      nomb <- c("Fernanda", "Amauri", "Rodrigo", "Alicia", "Webster", "Vírgen",
                "Yolanda", "Raúl", "Dulce", "Daniela", "Ceballos", "+5213322555291")
      ped <- c("ndaA", "uriG", "adrig", "lic", "sW", "sV",
               "ol", "lR", "eM", "yG", "oC", "32255")
      new_users <- numeric()
      for(i in 1:length(unique_us)){
        flag <- 0; j <- 1
        while(flag == 0){
          if(length(strsplit(unique_us[i], ped[j])[[1]]) == 2){
            new_users[i] <- nomb[j]
            flag <- 1
          }
          j <- j + 1
        }
      }
    }
    Users <- numeric()
    for(i in 1:length(obs$Users)){
      j <- 1
      flag <- 0
      while(flag == 0){
        if(obs$Users[i] == unique_us[j]){
          Users[i] <- new_users[j]
          flag <- 1
        }
        j <- j + 1
      }
    }
    Users <- factor(Users)
    obs$Users <- Users
  }
  return(obs)
}
# word_filter -------------------------------------------------------------
# Filters out the most common (stop) words in a given language.
word_filter <- function(data, lang = "spanish"){
  if(lang == "spanish"){
    # note: the accented words below assume the chat export was read with a
    # proper UTF-8 encoding; otherwise they may appear garbled and not match.
    excl <- c("Si", "si", "Sí", "sí", "No", "no", "Que", "que", "Quien",
              "quien", "Donde", "donde", "A", "a", "De", "de",
              "La", "la", "El", "el", "En", "en", "Y", "y", "Se",
              "se", "Es", "es", "Lo", "lo", "Con", "con", "Me", "me",
              "Por", "por", "Te", "te", "Un", "un", "sí", "Sí", "Ya",
              "ya", "Tu", "tu", "Como", "como", "Las", "las", "más", "mas",
              "Más", "Los", "los", "Para", "para", "mi", "eso", "una", "unas",
              "Pero", "pero", "Le", "le", "Del", "del", "Haha", "jaja", "Jajaja",
              "Jajajaja", "al", "está", "o", "bien", "su", "¿y", "esta", "estoy",
              "haha", "<media>", "va", "algo", "qué", "son", "hace", "día",
              "Jajajajaja", "porque", "así", "tú", "les", "hasta", "cosas",
              "ver", "voy", "Jaja", "cierto", "muy", "hay", "tengo", "entonces",
              "he", "tus", "dos", "Pues", "ni", "cuenta", "también", "había", "ese",
              "ha", "ti", "están", "¿que", "tienes", "haciendo", "Así", "pues", "bueno",
              "Jajaa", "Ni", "hecho", "puedo", "Yo", "yo", "Ok", "todo", "Okay", "nada",
              "cuando", "Sólo", "días", "mañana", "Buenos", "😘", "ir", "tal", "Voy",
              "Porque", "casa", "verdad", "vas", "pronto", "sólo", "Estoy", "Gracias",
              "estar", "mis", "avisas", "digo", "haces", "Esta", "noches")
    word_filter <- data
    for(i in excl){
      word_filter <- word_filter[word_filter$Words != i, ]
    }
    return(word_filter)
  }
}
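# Usage example (illustrative only, building on the hypothetical call above):
# chat <- clean_data("chat_export.txt")
# top_words <- word_filter(chat$word_count, lang = "spanish")
# head(top_words)  # most frequent non-stop words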