RHDZMOTA · April 1, 2016 04:29
diff --git a/dataviz_whatsapp.R b/dataviz_whatsapp.R
 # Whatsapp's data visualization
 # by: Rodrigo Hernández Mota
 # file name: dataviz_whatsapp.R
 # V.1.0.0

 # General structure
 # This file was structured as the data exploration process
 # developed. For that reason, the main sections and their
 # visualizations are divided into:
 # - word frequency
 # - usual words
 # - unusual words
 # - word length (underdeveloped)
 # - messaging per day
 # - word distribution in a day (underdeveloped)


 library(ggplot2)
 source("functions_whatsapp.R")

 # Candidate chat exports; one of them is passed to clean_data() below.
 available_data <- c("160322_chat_mufasa.txt",
                     "160322_atrh.txt",
                     "160324_chat_familia.txt",
                     "160325_chat_ds.txt")

 # Load the fourth export. clean_data() is not defined in this file; it is
 # expected to come from the sourced functions_whatsapp.R.
 dataset <- clean_data(available_data[4])

 # choose user (for a future viz)
 # NOTE(review): data_frame() is not exported by ggplot2; presumably
 # functions_whatsapp.R attaches dplyr/tibble -- confirm.
 unique_us   <- unique(dataset$observations$Users)
 aux_df      <- data_frame(User = unique_us[order(unique_us)])
 aux_df  # show the sorted users so the index used below can be checked
 select_user <- aux_df$User[2]
 # All other users first, the selected user last. The User column is subset
 # directly: as.character() on a data-frame subset deparses the whole column
 # into one string (e.g. 'c("a", "b")') instead of one name per user.
 order_users <- c(as.character(aux_df$User[aux_df$User != select_user]),
                  as.character(select_user))


 # Word frequency ----------------------------------------------------------

 # Word frequency for the first 50%
 # Rows with Acum below 50 (presumably a cumulative percentage -- confirm in
 # clean_data()); bars show Rel against the row position.
 select <- dataset$word_count[dataset$word_count$Acum < 50, ]
 # seq_len(nrow()) instead of 1:n: for an empty selection 1:0 is c(1, 0),
 # which no longer matches the number of rows.
 ggplot(select, aes(y = Rel, x = seq_len(nrow(select)))) + theme_light() +
   geom_bar(stat = "identity",
            alpha = 0.3, fill = "blue", color = "black") +
   ggtitle("Word frequency ranking (freq.acum 50%)") +
   xlab("Ranking") + ylab("Frequency (%)")

 # Word frequency for the first 25%
 # Same selection with a threshold of 25; the explicit scale limits keep the
 # words on the x axis in row order.
 select <- dataset$word_count[dataset$word_count$Acum < 25, ]
 ggplot(select, aes(y = Rel, x = Words)) + theme_light() +
   scale_x_discrete(limits = select$Words) +
   geom_bar(stat = "identity",
            alpha = 0.3, fill = "dark red", color = "black") +
   ggtitle("Word frequency ranking (freq.acum 25%)") +
   xlab("Ranking") + ylab("Frequency (%)")



 # Usual Words -------------------------------------------------------------

 # filter the common words
 # word_filter() is not defined in this file; presumably it comes from the
 # sourced functions_whatsapp.R.
 select <- word_filter(dataset$word_count)
 # head() returns at most 25 rows; select[1:25, ] would pad a shorter table
 # with all-NA rows that then leak into the axis limits.
 ggplot(head(select, 25)) +
   geom_bar(stat = "identity", aes(y = Rel, x = Words),
            alpha = 0.7, fill = "dark red", color = "black") +
   scale_x_discrete(limits = head(select, 25)$Words) +
   theme_light() +
   ggtitle("Most used 25 words (filtered)") +
   xlab(" Word") + ylab("Frequency (%)")

 # Unusual Words -----------------------------------------------------------

 # Word frequency, tail of the distribution: rows with Acum above 99.9, i.e.
 # the last 0.1% of accumulated use (assuming Acum is a cumulative
 # percentage -- confirm in clean_data()). The title states the same figure.
 select <- dataset$word_count[dataset$word_count$Acum > 99.9, ]
 ggplot(select) + theme_light() +
   geom_bar(stat = "identity", aes(y = Rel, x = Words),
            alpha = 0.7, fill = "dark gray", color = "black") +
   scale_x_discrete(limits = select$Words) +
   ggtitle("Less used words (0.1% of accumulated use)") +
   xlab(" Palabra") + ylab("Frecuencia Relativa (%)")

 # Unique words - 1 single use
 select <- dataset$word_count[dataset$word_count$Count == 1, ]
 # Plain data.frame so that [rows, "Words"] yields a vector.
 select <- as.data.frame(select)
 tot_unw <- nrow(select)
 # Draw up to 20 distinct rows uniformly. sample.int() can reach every row,
 # including the last one, and never repeats a word.
 rand_index <- sample.int(tot_unw, size = min(20, tot_unw))
 rand_unique_words <- as.character(select[rand_index, "Words"])
 # First character and character count of each word, vectorised; an empty
 # word gives "" and 0 instead of shifting the letters of the words after it.
 beg <- substr(rand_unique_words, 1, 1)
 len <- nchar(rand_unique_words)
 select <- data_frame(Words = rand_unique_words,
                      Beginning = beg,
                      Length = len)


 # Each sampled word is drawn as a labelled point: initial letter on x,
 # word length on y.
 ggplot(select, aes(Beginning, Length, label = Words)) +
   geom_point() +
   geom_text(hjust = 0, size = 5,
             nudge_x = 0.05, color = "dark blue") +
   theme_bw() +
   theme(
     legend.background = element_rect(fill = "white"),
     panel.grid.major = element_line(colour = "grey"),
     panel.grid.minor = element_blank()) +
   #geom_label(fontface = "bold") +
   ggtitle("20 random words used just one time") +
   xlab("Initial letter") + ylab("Word's length")


 # Word length -------------------------------------------------------------

 # not meaningful
 # Count against Word_lenght (column name as spelled in the data), with the
 # point size mapped to id_nl and the x axis limited to 0-15.
 ggplot(dataset$word_count, aes(Word_lenght, Count)) +
   geom_point(aes(size = id_nl), alpha = 0.3, color = "dark red") +
   theme_light() + xlim(0, 15) +
   theme(legend.position = "none") +
   ggtitle("Frequency and length") +
   xlab("Length") + ylab("Frequency") +
   # The x position is clamped to the visible range so the note is not
   # dropped when the longest word exceeds the xlim of 15. The label lines
   # are joined with "\n" so no source indentation ends up in the text.
   annotate("text",
            x = 3 * min(15, max(dataset$word_count$Word_lenght)) / 4,
            y = max(dataset$word_count$Count) * 0.8,
            label = paste("El área de los puntos es proporcional",
                          "al número de palabras con la misma",
                          "longitud y frecuencia", sep = "\n"),
            size = 3)

 # kind of meaningful
 # Density estimate of Word_lenght over the same 0-15 range.
 ggplot(dataset$word_count, aes(x = Word_lenght, y = ..density..)) +
   geom_density(fill = "blue", alpha = 0.3) + xlim(0, 15) + theme_light() +
   ggtitle("Estimated density") +
   xlab("Length of the word") + ylab("Density")

 # Messaging per day -------------------------------------------------------

 # Number of words messaged by day
 # One point per row of dataset$words_users, coloured by User.
 ggplot(dataset$words_users, aes(x = Date, y = Words, color = User)) +
   geom_point() +
   theme_light() +
   guides(color = guide_legend(ncol = 1)) +
   ggtitle("Number of words send by user") +
   xlab("Day") +
   ylab("Number")

 # Distribution of number of words send per day
 unique_us <- unique(dataset$observations$Users)
 if (length(unique_us) > 2) {

   # More than two users: outlined density per non-selected user, filled
   # density for select_user.
   ggplot(NULL) +
     geom_density(data = dataset$words_users[
                    dataset$words_users$User != select_user, ],
                  aes(x = Words, y = ..density.., colour = User)) +
     # The layer above maps colour, so the two-column legend is requested on
     # the colour guide.
     guides(colour = guide_legend(ncol = 2)) +
     geom_density(data = dataset$words_users[
                    dataset$words_users$User == select_user, ],
                  aes(x = Words, y = ..density.., fill = User), alpha = 0.35) +
     scale_fill_manual(name = "Selected", values = "blue") +
     guides(fill = guide_legend(ncol = 2)) +
     theme_light() +
     # Only the lower quarter of the observed range is shown.
     xlim(0, max(dataset$words_users$Words) / 4) +
     ggtitle("Number of words send by day") +
     xlab("Quantity") +
     ylab("Density")

 } else {

   # Two users or fewer: overlapping filled densities, one per user.
   ggplot(dataset$words_users, aes(x = Words, y = ..density.., fill = User)) +
     geom_density(alpha = 0.2) +
     theme_light() +
     xlim(0, max(dataset$words_users$Words) / 4) +
     ggtitle("Number of words send by day") +
     xlab("Quantity") +
     ylab("Density") +
     guides(fill = guide_legend(ncol = 1))

 }

 # One density panel per user, each with free axes. The argument is spelled
 # out as `scales`; `scale` only worked through partial argument matching.
 ggplot(dataset$words_users, aes(x = Words, y = ..density..)) +
   geom_density(alpha = 0.1, fill = "gray") +
   facet_wrap(~ User, scales = "free") +
   theme_light() +
   ggtitle("Words send by day") +
   xlab("Quantity") +
   ylab("Density")

 # Number of words distributed in a day ------------------------------------
 # Still under development.

 # Density of Hour over all rows of dataset$words_hour.
 ggplot(dataset$words_hour, aes(x = Hour, y = ..density..)) +
   geom_density(alpha = 0.3, fill = "blue") +
   theme_light() +
   ggtitle("Words send according to the hour") +
   xlab("Time of the day") +
   ylab("Density")

 if (length(unique_us) > 2) {
   # choose user
   # More than two users: outlined density per non-selected user, filled
   # density for select_user.
   ggplot(NULL) +
     geom_density(data = dataset$words_hour[
                    dataset$words_hour$Users != select_user, ],
                  aes(x = Hour, y = ..density.., colour = Users)) +
     # The layer above maps colour, so the two-column legend is requested on
     # the colour guide.
     guides(colour = guide_legend(ncol = 2)) +
     geom_density(data = dataset$words_hour[
                    dataset$words_hour$Users == select_user, ],
                  aes(x = Hour, y = ..density.., fill = Users), alpha = 0.35) +
     scale_fill_manual(name = "Selected", values = "blue") +
     guides(fill = guide_legend(ncol = 2)) +
     theme_light() +
     #xlim(0, x_lim[2]) +
     ggtitle("Función de densidad: frecuencia de palabras enviadas por usuario según la hora") +
     xlab("Hora del día") +
     ylab("Frecuencia relativa")

 } else {

   # Two users or fewer: overlapping filled densities, one per user.
   ggplot(dataset$words_hour, aes(x = Hour, y = ..density.., fill = Users)) +
     geom_density(alpha = 0.1) +
     theme_light() +
     ggtitle("Función de densidad: frecuencia de palabras enviadas según la hora por usuario") +
     xlab("Hora del día") +
     ylab("Frecuencia relativa")

 }


 # Polar bar chart of Words per Hours, filled by User; bars are full width
 # with a black outline. (scale_fill_brewer() stays disabled.)
 ggplot(data = dataset$freq_hour,
        mapping = aes(x = Hours, y = Words, fill = User)) +
   geom_bar(stat = "identity", width = 1, colour = "black") +
   coord_polar() +
   theme_minimal()

 # Same data as points, coloured by User.
 ggplot(data = dataset$freq_hour,
        mapping = aes(x = Hours, y = Words, colour = User)) +
   geom_point() +
   coord_polar() +
   theme_minimal()

 # Single stacked bar (constant x) wrapped into polar coordinates.
 ggplot(data = dataset$freq_hour,
        mapping = aes(x = factor(1), y = Words, fill = User)) +
   geom_bar(stat = "identity") +
   coord_polar()

 # Bars per hour with position "fill".
 ggplot(data = dataset$freq_hour,
        mapping = aes(x = Hours, y = Words, fill = User)) +
   geom_bar(stat = "identity", position = "fill") +
   coord_polar() +
   theme_minimal()

 # Bars per hour with position "stack".
 ggplot(data = dataset$freq_hour,
        mapping = aes(x = Hours, y = Words, fill = User)) +
   geom_bar(stat = "identity", position = "stack") +
   coord_polar() +
   theme_minimal()

 # Bars per hour with position "dodge".
 ggplot(data = dataset$freq_hour,
        mapping = aes(x = Hours, y = Words, fill = User)) +
   geom_bar(stat = "identity", position = "dodge") +
   coord_polar() +
   theme_minimal()

 # Density of Hour, filled by Users, in polar coordinates.
 ggplot(data = dataset$words_hour,
        mapping = aes(x = Hour, y = ..density.., fill = Users)) +
   geom_density(alpha = 0.1) +
   coord_polar()

 # Density of Hour, one panel per Users value.
 ggplot(data = dataset$words_hour,
        mapping = aes(x = Hour, y = ..density..)) +
   geom_density(alpha = 0.1, fill = "gray") +
   facet_wrap(~ Users) +
   theme_light() +
   labs(title = "Prob", x = "Time of the day", y = "Density")
	# Whatsapp's data visualization
	# by: Rodrigo Hernández Mota
	# file name: dataviz_whatsapp.R
	# V.1.0.0

	# General structure
	# This file was structured as the data exploration process
	# developed. For that reason, the main sections and their
	# visualizations are divided into:
	# - word frequency
	# - usual words
	# - unusual words
	# - word length (underdeveloped)
	# - messaging per day
	# - word distribution in a day (underdeveloped)


	library(ggplot2)
	source("functions_whatsapp.R")

	# Candidate chat exports; one of them is passed to clean_data() below.
	available_data <- c("160322_chat_mufasa.txt",
	                    "160322_atrh.txt",
	                    "160324_chat_familia.txt",
	                    "160325_chat_ds.txt")

	# Load the fourth export. clean_data() is not defined in this file; it is
	# expected to come from the sourced functions_whatsapp.R.
	dataset <- clean_data(available_data[4])

	# choose user (for a future viz)
	# NOTE(review): data_frame() is not exported by ggplot2; presumably
	# functions_whatsapp.R attaches dplyr/tibble -- confirm.
	unique_us <- unique(dataset$observations$Users)
	aux_df <- data_frame(User = unique_us[order(unique_us)])
	aux_df  # show the sorted users so the index used below can be checked
	select_user <- aux_df$User[2]
	# All other users first, the selected user last. The User column is subset
	# directly: as.character() on a data-frame subset deparses the whole column
	# into one string (e.g. 'c("a", "b")') instead of one name per user.
	order_users <- c(as.character(aux_df$User[aux_df$User != select_user]),
	                 as.character(select_user))


	# Word frequency ----------------------------------------------------------

	# Word frequency for the first 50%
	# Rows with Acum below 50 (presumably a cumulative percentage -- confirm in
	# clean_data()); bars show Rel against the row position.
	select <- dataset$word_count[dataset$word_count$Acum < 50, ]
	# seq_len(nrow()) instead of 1:n: for an empty selection 1:0 is c(1, 0),
	# which no longer matches the number of rows.
	ggplot(select, aes(y = Rel, x = seq_len(nrow(select)))) + theme_light() +
	  geom_bar(stat = "identity",
	           alpha = 0.3, fill = "blue", color = "black") +
	  ggtitle("Word frequency ranking (freq.acum 50%)") +
	  xlab("Ranking") + ylab("Frequency (%)")

	# Word frequency for the first 25%
	# Same selection with a threshold of 25; the explicit scale limits keep the
	# words on the x axis in row order.
	select <- dataset$word_count[dataset$word_count$Acum < 25, ]
	ggplot(select, aes(y = Rel, x = Words)) + theme_light() +
	  scale_x_discrete(limits = select$Words) +
	  geom_bar(stat = "identity",
	           alpha = 0.3, fill = "dark red", color = "black") +
	  ggtitle("Word frequency ranking (freq.acum 25%)") +
	  xlab("Ranking") + ylab("Frequency (%)")



	# Usual Words -------------------------------------------------------------

	# filter the common words
	# word_filter() is not defined in this file; presumably it comes from the
	# sourced functions_whatsapp.R.
	select <- word_filter(dataset$word_count)
	# head() returns at most 25 rows; select[1:25, ] would pad a shorter table
	# with all-NA rows that then leak into the axis limits.
	ggplot(head(select, 25)) +
	  geom_bar(stat = "identity", aes(y = Rel, x = Words),
	           alpha = 0.7, fill = "dark red", color = "black") +
	  scale_x_discrete(limits = head(select, 25)$Words) +
	  theme_light() +
	  ggtitle("Most used 25 words (filtered)") +
	  xlab(" Word") + ylab("Frequency (%)")

	# Unusual Words -----------------------------------------------------------

	# Word frequency, tail of the distribution: rows with Acum above 99.9, i.e.
	# the last 0.1% of accumulated use (assuming Acum is a cumulative
	# percentage -- confirm in clean_data()). The title states the same figure.
	select <- dataset$word_count[dataset$word_count$Acum > 99.9, ]
	ggplot(select) + theme_light() +
	  geom_bar(stat = "identity", aes(y = Rel, x = Words),
	           alpha = 0.7, fill = "dark gray", color = "black") +
	  scale_x_discrete(limits = select$Words) +
	  ggtitle("Less used words (0.1% of accumulated use)") +
	  xlab(" Palabra") + ylab("Frecuencia Relativa (%)")

	# Unique words - 1 single use
	select <- dataset$word_count[dataset$word_count$Count == 1, ]
	# Plain data.frame so that [rows, "Words"] yields a vector.
	select <- as.data.frame(select)
	tot_unw <- nrow(select)
	# Draw up to 20 distinct rows uniformly. sample.int() can reach every row,
	# including the last one, and never repeats a word.
	rand_index <- sample.int(tot_unw, size = min(20, tot_unw))
	rand_unique_words <- as.character(select[rand_index, "Words"])
	# First character and character count of each word, vectorised; an empty
	# word gives "" and 0 instead of shifting the letters of the words after it.
	beg <- substr(rand_unique_words, 1, 1)
	len <- nchar(rand_unique_words)
	select <- data_frame(Words = rand_unique_words,
	                     Beginning = beg,
	                     Length = len)


	# Each sampled word is drawn as a labelled point: initial letter on x,
	# word length on y.
	ggplot(select, aes(Beginning, Length, label = Words)) +
	  geom_point() +
	  geom_text(hjust = 0, size = 5,
	            nudge_x = 0.05, color = "dark blue") +
	  theme_bw() +
	  theme(
	    legend.background = element_rect(fill = "white"),
	    panel.grid.major = element_line(colour = "grey"),
	    panel.grid.minor = element_blank()) +
	  #geom_label(fontface = "bold") +
	  ggtitle("20 random words used just one time") +
	  xlab("Initial letter") + ylab("Word's length")


	# Word length -------------------------------------------------------------

	# not meaningful
	# Count against Word_lenght (column name as spelled in the data), with the
	# point size mapped to id_nl and the x axis limited to 0-15.
	ggplot(dataset$word_count, aes(Word_lenght, Count)) +
	  geom_point(aes(size = id_nl), alpha = 0.3, color = "dark red") +
	  theme_light() + xlim(0, 15) +
	  theme(legend.position = "none") +
	  ggtitle("Frequency and length") +
	  xlab("Length") + ylab("Frequency") +
	  # The x position is clamped to the visible range so the note is not
	  # dropped when the longest word exceeds the xlim of 15. The label lines
	  # are joined with "\n" so no source indentation ends up in the text.
	  annotate("text",
	           x = 3 * min(15, max(dataset$word_count$Word_lenght)) / 4,
	           y = max(dataset$word_count$Count) * 0.8,
	           label = paste("El área de los puntos es proporcional",
	                         "al número de palabras con la misma",
	                         "longitud y frecuencia", sep = "\n"),
	           size = 3)

	# kind of meaningful
	# Density estimate of Word_lenght over the same 0-15 range.
	ggplot(dataset$word_count, aes(x = Word_lenght, y = ..density..)) +
	  geom_density(fill = "blue", alpha = 0.3) + xlim(0, 15) + theme_light() +
	  ggtitle("Estimated density") +
	  xlab("Length of the word") + ylab("Density")

	# Messaging per day -------------------------------------------------------

	# Number of words messaged by day
	# One point per row of dataset$words_users, coloured by User.
	ggplot(dataset$words_users, aes(x = Date, y = Words, color = User)) +
	  geom_point() +
	  theme_light() +
	  guides(color = guide_legend(ncol = 1)) +
	  ggtitle("Number of words send by user") +
	  xlab("Day") +
	  ylab("Number")

	# Distribution of number of words send per day
	unique_us <- unique(dataset$observations$Users)
	if (length(unique_us) > 2) {

	  # More than two users: outlined density per non-selected user, filled
	  # density for select_user.
	  ggplot(NULL) +
	    geom_density(data = dataset$words_users[
	                   dataset$words_users$User != select_user, ],
	                 aes(x = Words, y = ..density.., colour = User)) +
	    # The layer above maps colour, so the two-column legend is requested on
	    # the colour guide.
	    guides(colour = guide_legend(ncol = 2)) +
	    geom_density(data = dataset$words_users[
	                   dataset$words_users$User == select_user, ],
	                 aes(x = Words, y = ..density.., fill = User), alpha = 0.35) +
	    scale_fill_manual(name = "Selected", values = "blue") +
	    guides(fill = guide_legend(ncol = 2)) +
	    theme_light() +
	    # Only the lower quarter of the observed range is shown.
	    xlim(0, max(dataset$words_users$Words) / 4) +
	    ggtitle("Number of words send by day") +
	    xlab("Quantity") +
	    ylab("Density")

	} else {

	  # Two users or fewer: overlapping filled densities, one per user.
	  ggplot(dataset$words_users, aes(x = Words, y = ..density.., fill = User)) +
	    geom_density(alpha = 0.2) +
	    theme_light() +
	    xlim(0, max(dataset$words_users$Words) / 4) +
	    ggtitle("Number of words send by day") +
	    xlab("Quantity") +
	    ylab("Density") +
	    guides(fill = guide_legend(ncol = 1))

	}

	# One density panel per user, each with free axes. The argument is spelled
	# out as `scales`; `scale` only worked through partial argument matching.
	ggplot(dataset$words_users, aes(x = Words, y = ..density..)) +
	  geom_density(alpha = 0.1, fill = "gray") +
	  facet_wrap(~ User, scales = "free") +
	  theme_light() +
	  ggtitle("Words send by day") +
	  xlab("Quantity") +
	  ylab("Density")

	# Number of words distributed in a day ------------------------------------
	# Still under development.

	# Density of Hour over all rows of dataset$words_hour.
	ggplot(dataset$words_hour, aes(x = Hour, y = ..density..)) +
	  geom_density(alpha = 0.3, fill = "blue") +
	  theme_light() +
	  ggtitle("Words send according to the hour") +
	  xlab("Time of the day") +
	  ylab("Density")

	if (length(unique_us) > 2) {
	  # choose user
	  # More than two users: outlined density per non-selected user, filled
	  # density for select_user.
	  ggplot(NULL) +
	    geom_density(data = dataset$words_hour[
	                   dataset$words_hour$Users != select_user, ],
	                 aes(x = Hour, y = ..density.., colour = Users)) +
	    # The layer above maps colour, so the two-column legend is requested on
	    # the colour guide.
	    guides(colour = guide_legend(ncol = 2)) +
	    geom_density(data = dataset$words_hour[
	                   dataset$words_hour$Users == select_user, ],
	                 aes(x = Hour, y = ..density.., fill = Users), alpha = 0.35) +
	    scale_fill_manual(name = "Selected", values = "blue") +
	    guides(fill = guide_legend(ncol = 2)) +
	    theme_light() +
	    #xlim(0, x_lim[2]) +
	    ggtitle("Función de densidad: frecuencia de palabras enviadas por usuario según la hora") +
	    xlab("Hora del día") +
	    ylab("Frecuencia relativa")

	} else {

	  # Two users or fewer: overlapping filled densities, one per user.
	  ggplot(dataset$words_hour, aes(x = Hour, y = ..density.., fill = Users)) +
	    geom_density(alpha = 0.1) +
	    theme_light() +
	    ggtitle("Función de densidad: frecuencia de palabras enviadas según la hora por usuario") +
	    xlab("Hora del día") +
	    ylab("Frecuencia relativa")

	}


	# Polar bar chart of Words per Hours, filled by User; bars are full width
	# with a black outline. (scale_fill_brewer() stays disabled.)
	ggplot(data = dataset$freq_hour,
	       mapping = aes(x = Hours, y = Words, fill = User)) +
	  geom_bar(stat = "identity", width = 1, colour = "black") +
	  coord_polar() +
	  theme_minimal()

	# Same data as points, coloured by User.
	ggplot(data = dataset$freq_hour,
	       mapping = aes(x = Hours, y = Words, colour = User)) +
	  geom_point() +
	  coord_polar() +
	  theme_minimal()

	# Single stacked bar (constant x) wrapped into polar coordinates.
	ggplot(data = dataset$freq_hour,
	       mapping = aes(x = factor(1), y = Words, fill = User)) +
	  geom_bar(stat = "identity") +
	  coord_polar()

	# Bars per hour with position "fill".
	ggplot(data = dataset$freq_hour,
	       mapping = aes(x = Hours, y = Words, fill = User)) +
	  geom_bar(stat = "identity", position = "fill") +
	  coord_polar() +
	  theme_minimal()

	# Bars per hour with position "stack".
	ggplot(data = dataset$freq_hour,
	       mapping = aes(x = Hours, y = Words, fill = User)) +
	  geom_bar(stat = "identity", position = "stack") +
	  coord_polar() +
	  theme_minimal()

	# Bars per hour with position "dodge".
	ggplot(data = dataset$freq_hour,
	       mapping = aes(x = Hours, y = Words, fill = User)) +
	  geom_bar(stat = "identity", position = "dodge") +
	  coord_polar() +
	  theme_minimal()

	# Density of Hour, filled by Users, in polar coordinates.
	ggplot(data = dataset$words_hour,
	       mapping = aes(x = Hour, y = ..density.., fill = Users)) +
	  geom_density(alpha = 0.1) +
	  coord_polar()

	# Density of Hour, one panel per Users value.
	ggplot(data = dataset$words_hour,
	       mapping = aes(x = Hour, y = ..density..)) +
	  geom_density(alpha = 0.1, fill = "gray") +
	  facet_wrap(~ Users) +
	  theme_light() +
	  labs(title = "Prob", x = "Time of the day", y = "Density")