vpnagraj · April 9, 2024 14:17
diff --git a/eda_tools.R b/eda_tools.R
 ###############################################################################
 ## brief demo of exploratory data analysis (EDA) tools for data frames in R
 ## NOTE: the code below is intended to preview the EDA tools ...
 ## ... it does not exhaustively demonstrate functionality for these tools ...
 ## ... and it is current as of 2024-04-09 ... 
 ## ... for more information refer to the documentation for each package
 ###############################################################################

 ###############################################################################
 ## set up
 ## load tidyverse for data manipulation
 ## NOTE: tidyverse includes multiple packages ...
 ## dplyr, tidyr, ggplot2, and more 
 library(tidyverse)
 ## palmerpenguins supplies the penguins data frame used throughout
 library(palmerpenguins)

 ## the :: operator refers to an object through its package (pkg::name) ...
 ## ... compare the attached name with the fully qualified one
 penguins
 palmerpenguins::penguins
 all.equal(penguins, palmerpenguins::penguins)

 ## open the help page for the pipe operator used in the rest of the script
 help("%>%")

 ###############################################################################

 ## first pass: explore the data "manually" with general-purpose functions
 ## base R summary() reports basic summary stats for the data frame
 penguins %>%
   summary()

 ## dplyr glimpse() previews the first several values of each column
 penguins %>%
   glimpse()

 ## dplyr summarise() counts the missing values in a single column (sex)
 penguins %>%
   summarise(n_missing_sex = sum(is.na(sex)))

 ###############################################################################
 ## purpose-built tools for EDA ...
 ###############################################################################

 ###############################################################################
 ## attach the skimr package
 library(skimr)

 ## skim() gives a quick, informative summary of all columns at once
 penguins %>%
   skim()

 ###############################################################################

 ###############################################################################
 ## attach the dlookr package
 library(dlookr)

 ## diagnose(): overall "diagnosis" of the variables in the data frame
 penguins %>%
   diagnose()

 ## diagnose_category(): summary specific to the categorical variables
 penguins %>%
   diagnose_category()

 ## diagnose_outlier(): outlier detection and summary
 penguins %>%
   diagnose_outlier()

 ## dplyr verbs can refine an EDA summary, e.g., group_by() sex first ...
 ## ... then run the outlier summary with sex dropped from the selection
 penguins %>%
   group_by(sex) %>%
   diagnose_outlier(-"sex")

 ###############################################################################

 ###############################################################################
 ## attach the naniar package
 library(naniar)

 ## gg_miss_var(): plot the count of missing values for each variable
 penguins %>%
   gg_miss_var()

 ## imputation example: fill every missing sex with the fixed value "male"
 penguins %>%
   mutate(sex = impute_factor(sex, value = "male"))

 ## imputation example: fill each missing sex with its own random draw ...
 ## ... of "male" (weight 0.3) or "female" (weight 0.7)
 ## NOTE: one value is sampled per missing entry (size = number of NAs) ...
 ## ... a single draw (size = 1) would give every missing row the same value
 ## base replace() overwrites only the NA positions with the sampled values
 penguins %>%
   mutate(
     sex = replace(
       sex,
       is.na(sex),
       sample(
         c("male", "female"),
         size = sum(is.na(sex)),
         replace = TRUE,
         prob = c(0.3, 0.7)
       )
     )
   )

 ###############################################################################

 ###############################################################################
 ## next steps
 ## the tools above are useful for EDA of pre/post-processed data frames
 ## as an extra step (especially for post-processing) you may also need data validation
 ## a few R packages to consider for data validation ...
 ## pointblank (https://rstudio.github.io/pointblank/)
 ## assertr (https://docs.ropensci.org/assertr/)
 ## validate (https://cran.r-project.org/web/packages/validate/vignettes/cookbook.html)
 ###############################################################################
	###############################################################################
	## brief demo of exploratory data analysis (EDA) tools for data frames in R
	## NOTE: the code below is intended to preview the EDA tools ...
	## ... it does not exhaustively demonstrate functionality for these tools ...
	## ... and it is current as of 2024-04-09 ...
	## ... for more information refer to the documentation for each package
	###############################################################################

	###############################################################################
	## set up
	## load tidyverse for data manipulation
	## NOTE: tidyverse includes multiple packages ...
	## dplyr, tidyr, ggplot2, and more
	library(tidyverse)
	## palmerpenguins supplies the penguins data frame used throughout
	library(palmerpenguins)

	## the :: operator refers to an object through its package (pkg::name) ...
	## ... compare the attached name with the fully qualified one
	penguins
	palmerpenguins::penguins
	all.equal(penguins, palmerpenguins::penguins)

	## open the help page for the pipe operator used in the rest of the script
	help("%>%")

	###############################################################################

	## first pass: explore the data "manually" with general-purpose functions
	## base R summary() reports basic summary stats for the data frame
	penguins %>%
	  summary()

	## dplyr glimpse() previews the first several values of each column
	penguins %>%
	  glimpse()

	## dplyr summarise() counts the missing values in a single column (sex)
	penguins %>%
	  summarise(n_missing_sex = sum(is.na(sex)))

	###############################################################################
	## purpose-built tools for EDA ...
	###############################################################################

	###############################################################################
	## attach the skimr package
	library(skimr)

	## skim() gives a quick, informative summary of all columns at once
	penguins %>%
	  skim()

	###############################################################################

	###############################################################################
	## attach the dlookr package
	library(dlookr)

	## diagnose(): overall "diagnosis" of the variables in the data frame
	penguins %>%
	  diagnose()

	## diagnose_category(): summary specific to the categorical variables
	penguins %>%
	  diagnose_category()

	## diagnose_outlier(): outlier detection and summary
	penguins %>%
	  diagnose_outlier()

	## dplyr verbs can refine an EDA summary, e.g., group_by() sex first ...
	## ... then run the outlier summary with sex dropped from the selection
	penguins %>%
	  group_by(sex) %>%
	  diagnose_outlier(-"sex")

	###############################################################################

	###############################################################################
	## attach the naniar package
	library(naniar)

	## gg_miss_var(): plot the count of missing values for each variable
	penguins %>%
	  gg_miss_var()

	## imputation example: fill every missing sex with the fixed value "male"
	penguins %>%
	  mutate(sex = impute_factor(sex, value = "male"))

	## imputation example: fill each missing sex with its own random draw ...
	## ... of "male" (weight 0.3) or "female" (weight 0.7)
	## NOTE: one value is sampled per missing entry (size = number of NAs) ...
	## ... a single draw (size = 1) would give every missing row the same value
	## base replace() overwrites only the NA positions with the sampled values
	penguins %>%
	  mutate(
	    sex = replace(
	      sex,
	      is.na(sex),
	      sample(
	        c("male", "female"),
	        size = sum(is.na(sex)),
	        replace = TRUE,
	        prob = c(0.3, 0.7)
	      )
	    )
	  )

	###############################################################################

	###############################################################################
	## next steps
	## the tools above are useful for EDA of pre/post-processed data frames
	## as an extra step (especially for post-processing) you may also need data validation
	## a few R packages to consider for data validation ...
	## pointblank (https://rstudio.github.io/pointblank/)
	## assertr (https://docs.ropensci.org/assertr/)
	## validate (https://cran.r-project.org/web/packages/validate/vignettes/cookbook.html)
	###############################################################################