bgall · October 26, 2019 21:05
diff --git a/scale_creation_for_students b/scale_creation_for_students
 #########################################################################################
 # This gist contains a quick walk-through of several ways to produce scales capturing
 # the average value of one or more variables. Each row (observation) gets its own
 # value. We'll assume your data are not fully "tidy." What I mean by this is that you
 # have an observation for each row and you want to calculate that observation's value
 # on the scale, but each variable that should go into your scale is in its own column.
 #########################################################################################

 #########################################################################################
 # Set-up (packages, fake data, etc.)
 #########################################################################################

 # Load dplyr
 library(dplyr)

 # Fix the random-number seed so the simulated values are reproducible.
 set.seed(9)

 # Create some fake data with variables x, y, z, w
 # for example purposes.
 mydata <- data.frame(
   x = rnorm(10),               # 10 standard-normal draws
   y = runif(10),               # 10 uniform draws
   z = seq_len(10),             # the integers 1 through 10
   w = c(1, rep(NA_real_, 9))   # one observed value, then 9 missing values
 )

 # Let's take a look at our raw data. Note
 # that we have missing values in variable w.
 mydata

 #########################################################################################
 # Scale creation
 #########################################################################################

 # We'll now create 2 scales. Each scale will
 # capture the average of two variables in our data.

 # Which variables? Let's choose two variables
 # for each of the scales: scale1 is going
 # to reflect the mean of variables named x and y,
 # scale2 will reflect the average of variables z and y.

 # Names of the columns that feed each scale.
 scale1_vars <- c("x", "y")
 scale2_vars <- c("y", "z")

 # Now let's generate our scales. If a row is missing a
 # value for one of the variables, the average will be
 # computed from the other variable if it is not also
 # missing. If both values are missing, rowMeans() returns
 # NaN for that row (is.na() also reports NaN as missing).

 # The code below tells the mutate() function from the dplyr package
 # that we want to generate new variables (scale1 and scale2) by
 # calculating the average of the values observed in each row for
 # only those columns containing the variables we previously included
 # in scale1_vars or scale2_vars. A different scale is generated for
 # each group of variables.
 #
 # drop = FALSE keeps the selected columns as a data frame even when a
 # scale has only one variable. Without it, a single column collapses to
 # a plain vector and rowMeans() stops with an error.
 fulldf <- mydata %>%
   dplyr::mutate(
     scale1 = rowMeans(mydata[, scale1_vars, drop = FALSE], na.rm = TRUE),
     scale2 = rowMeans(mydata[, scale2_vars, drop = FALSE], na.rm = TRUE)
   )
                                   
 # A benefit of the above approach is that you can put any variables you
 # want into scale1_vars or scale2_vars and you don't need to know how
 # many variables you put in - the code will simply calculate the average
 # across all of the columns.

 # Now let's calculate the averages manually. We can do this by
 # adding up the variables we want in our scale, then dividing by the
 # number of variables to get the average of the variables.
 fulldf <- fulldf %>%
   dplyr::mutate(
     scale1_manual = (x + y) / 2,
     scale2_manual = (y + z) / 2
   )

 # Let's make sure the manual approach and the other approach produce
 # the same result. Both versions are decimal (floating-point) numbers
 # computed in two different ways, so we compare them with all.equal(),
 # which tolerates tiny rounding differences. identical() demands an exact
 # bit-for-bit match and can say FALSE for values that are equal for all
 # practical purposes. all.equal() returns either TRUE or a description of
 # the differences, so we wrap it in isTRUE(): TRUE if equal, FALSE if not.
 isTRUE(all.equal(fulldf$scale1_manual, fulldf$scale1)) # TRUE
 isTRUE(all.equal(fulldf$scale2_manual, fulldf$scale2)) # TRUE
                              
 # Based on the above, our approach works! The former approach is nice because
 # (1) you don't need to type out the # of variables you are including in the scale
 # and so don't run the risk of forgetting to change the number you're dividing 
 # by if/when you change the number of items in your scale and (2) it calculates
 # averages for rows using the available variables where there is no missingness
 # rather than simply returning NA if ANY variable is missing. On the other hand,
 # (a) it is more lines of code because you first say which sets of variables you
 # want to include in your scales in some lines of code then actually generate
 # the scales in some more code, (b) it's not so obvious mathematically what you
 # are doing unless you immediately see "rowMeans" and know that means it is
 # calculating the average of the rows for each scale, and (c) you might want
 # to drop any observation with missing data rather than simply use the available 
 # data to calculate an average. For example, if we make a scale using the variable
 # with missing data (w) then we will find the two approaches produce different 
 # results:

 # Approach 1: average whichever of the listed variables are present in
 # each row. drop = FALSE keeps the selected columns as a data frame even
 # if scale3_vars is shortened to a single variable; without it, a single
 # column collapses to a plain vector and rowMeans() stops with an error.
 scale3_vars <- c("w", "z")
 fulldf <- fulldf %>%
   dplyr::mutate(
     scale3 = rowMeans(fulldf[, scale3_vars, drop = FALSE], na.rm = TRUE)
   )

 # Approach 2: add the variables and divide by how many there are. The
 # result is NA for any row where w or z is missing.
 fulldf <- fulldf %>%
   dplyr::mutate(scale3_manual = (w + z) / 2)

 # Test if identical
 identical(fulldf$scale3, fulldf$scale3_manual) # FALSE

 # To see why, here are the computed scale values for the two different approaches:
 fulldf$scale3
 fulldf$scale3_manual

 # Note: there are more efficient ways of doing this as well, such as writing a 
 # function to implement the second approach simply by giving the function the
 # variable names and scale name, while also building in functionality to let
 # the user choose to include or omit variables with missing values. The 
 # examples above are reasonable ways of doing this for those just learning R
 # and/or the Tidyverse.
	#########################################################################################
	# This gist contains a quick walk-through of several ways to produce scales capturing
	# the average value of one or more variables. Each row (observation) gets its own
	# value. We'll assume your data are not fully "tidy." What I mean by this is that you
	# have an observation for each row and you want to calculate that observation's value
	# on the scale, but each variable that should go into your scale is in its own column.
	#########################################################################################

	#########################################################################################
	# Set-up (packages, fake data, etc.)
	#########################################################################################

	# Load dplyr
	library(dplyr)

	# Fix the random-number seed so the simulated values are reproducible.
	set.seed(9)

	# Create some fake data with variables x, y, z, w
	# for example purposes.
	mydata <- data.frame(
	  x = rnorm(10),               # 10 standard-normal draws
	  y = runif(10),               # 10 uniform draws
	  z = seq_len(10),             # the integers 1 through 10
	  w = c(1, rep(NA_real_, 9))   # one observed value, then 9 missing values
	)

	# Let's take a look at our raw data. Note
	# that we have missing values in variable w.
	mydata

	#########################################################################################
	# Scale creation
	#########################################################################################

	# We'll now create 2 scales. Each scale will
	# capture the average of two variables in our data.

	# Which variables? Let's choose two variables
	# for each of the scales: scale1 is going
	# to reflect the mean of variables named x and y,
	# scale2 will reflect the average of variables z and y.

	# Names of the columns that feed each scale.
	scale1_vars <- c("x", "y")
	scale2_vars <- c("y", "z")

	# Now let's generate our scales. If a row is missing a
	# value for one of the variables, the average will be
	# computed from the other variable if it is not also
	# missing. If both values are missing, rowMeans() returns
	# NaN for that row (is.na() also reports NaN as missing).

	# The code below tells the mutate() function from the dplyr package
	# that we want to generate new variables (scale1 and scale2) by
	# calculating the average of the values observed in each row for
	# only those columns containing the variables we previously included
	# in scale1_vars or scale2_vars. A different scale is generated for
	# each group of variables.
	#
	# drop = FALSE keeps the selected columns as a data frame even when a
	# scale has only one variable. Without it, a single column collapses to
	# a plain vector and rowMeans() stops with an error.
	fulldf <- mydata %>%
	  dplyr::mutate(
	    scale1 = rowMeans(mydata[, scale1_vars, drop = FALSE], na.rm = TRUE),
	    scale2 = rowMeans(mydata[, scale2_vars, drop = FALSE], na.rm = TRUE)
	  )

	# A benefit of the above approach is that you can put any variables you
	# want into scale1_vars or scale2_vars and you don't need to know how
	# many variables you put in - the code will simply calculate the average
	# across all of the columns.

	# Now let's calculate the averages manually. We can do this by
	# adding up the variables we want in our scale, then dividing by the
	# number of variables to get the average of the variables.
	fulldf <- fulldf %>%
	  dplyr::mutate(
	    scale1_manual = (x + y) / 2,
	    scale2_manual = (y + z) / 2
	  )

	# Let's make sure the manual approach and the other approach produce
	# the same result. Both versions are decimal (floating-point) numbers
	# computed in two different ways, so we compare them with all.equal(),
	# which tolerates tiny rounding differences. identical() demands an exact
	# bit-for-bit match and can say FALSE for values that are equal for all
	# practical purposes. all.equal() returns either TRUE or a description of
	# the differences, so we wrap it in isTRUE(): TRUE if equal, FALSE if not.
	isTRUE(all.equal(fulldf$scale1_manual, fulldf$scale1)) # TRUE
	isTRUE(all.equal(fulldf$scale2_manual, fulldf$scale2)) # TRUE

	# Based on the above, our approach works! The former approach is nice because
	# (1) you don't need to type out the # of variables you are including in the scale
	# and so don't run the risk of forgetting to change the number you're dividing
	# by if/when you change the number of items in your scale and (2) it calculates
	# averages for rows using the available variables where there is no missingness
	# rather than simply returning NA if ANY variable is missing. On the other hand,
	# (a) it is more lines of code because you first say which sets of variables you
	# want to include in your scales in some lines of code then actually generate
	# the scales in some more code, (b) it's not so obvious mathematically what you
	# are doing unless you immediately see "rowMeans" and know that means it is
	# calculating the average of the rows for each scale, and (c) you might want
	# to drop any observation with missing data rather than simply use the available
	# data to calculate an average. For example, if we make a scale using the variable
	# with missing data (w) then we will find the two approaches produce different
	# results:

	# Approach 1: average whichever of the listed variables are present in
	# each row. drop = FALSE keeps the selected columns as a data frame even
	# if scale3_vars is shortened to a single variable; without it, a single
	# column collapses to a plain vector and rowMeans() stops with an error.
	scale3_vars <- c("w", "z")
	fulldf <- fulldf %>%
	  dplyr::mutate(
	    scale3 = rowMeans(fulldf[, scale3_vars, drop = FALSE], na.rm = TRUE)
	  )

	# Approach 2: add the variables and divide by how many there are. The
	# result is NA for any row where w or z is missing.
	fulldf <- fulldf %>%
	  dplyr::mutate(scale3_manual = (w + z) / 2)

	# Test if identical
	identical(fulldf$scale3, fulldf$scale3_manual) # FALSE

	# To see why, here are the computed scale values for the two different approaches:
	fulldf$scale3
	fulldf$scale3_manual

	# Note: there are more efficient ways of doing this as well, such as writing a
	# function to implement the second approach simply by giving the function the
	# variable names and scale name, while also building in functionality to let
	# the user choose to include or omit variables with missing values. The
	# examples above are reasonable ways of doing this for those just learning R
	# and/or the Tidyverse.