jdittrich · January 20, 2024 16:42
diff --git a/expand multiple choice to bool columns.R b/expand multiple choice to bool columns.R
 # Survey tools often store the answers to a multiple-choice question as a single value of concatenated strings:
 # If asking which programming languages from a list people use, tools will output "C;Python;R" if users selected these options.
 # These will be put in a column representing the question’s answers. However, this format is hard to work with.
 #
 # The code below creates a %>%-able function "stringColToBoolMatches"
 #
 # parameters: df <tibble>, stringColName<character>, name of the column with the answers, separator<character> e.g. ";"
 # returning: a new dataframe that keeps the original column but expands its values to boolean columns, one for each multiple choice option
 #
 # the new columns are named "<column>_<option>", one per unique answer option, and can be NA (original value was NA), TRUE (option selected) or FALSE (option not selected)
 # e.g. if the column "lang" had a value like "C;Python;R", in the same row the columns "lang_C", "lang_Python", "lang_R" will be TRUE,
 # but "lang_Java", "lang_go", etc. will be FALSE.

 library(tidyverse)

 # Collects every distinct answer option that occurs in a multiple-choice column.
 #
 # parameters: df <tibble>, columnName<character>, separator<character> e.g. ";"
 # returns: <character> vector of the unique options, sorted in decreasing order
 #          (character(0) if the column holds no answers at all)
 #
 # Missing answers (NA) are skipped, so they do not show up as a literal "NA" option.
 # The separator is matched literally, not as a regular expression, so separators
 # such as "|" or "." work as expected.
 getUniqueValuesFromStringColumn <- function(df, columnName, separator) {
   answers <- as.character(df[[columnName]])
   answers <- answers[!is.na(answers)]

   # Split each row on its own so the options found here are exactly the
   # pieces a per-row split of the same value produces.
   options <- unlist(
     strsplit(answers, split = separator, fixed = TRUE),
     use.names = FALSE
   )
   if (is.null(options)) {
     return(character(0))
   }

   sort(unique(options), decreasing = TRUE)
 }

 # Checks whether one answer option was selected in a single concatenated answer.
 #
 # parameters: answersValue<character> e.g. "C;python;Java", optionToCheck<character> e.g. "Java", separator<character> e.g. ";"
 # returns: <boolean> TRUE if the option is among the answers, FALSE if it is not, NA if answersValue is NA
 #
 # The separator is matched literally (not as a regular expression) and the pieces
 # are compared exactly, so whitespace around an option is significant.
 isInAnswerOptions <- function(answersValue, optionToCheck, separator) {
   if (is.na(answersValue)) {
     return(NA)
   }
   # strsplit returns a list with one element per input string; take the only one.
   selectedOptions <- strsplit(answersValue, split = separator, fixed = TRUE)[[1]]
   optionToCheck %in% selectedOptions
 }


 # Adds one logical column telling, per row, whether answerOption was selected.
 #
 # parameters: df <tibble>, stringColName<character>, name of the column with the answers, answerOption<character> e.g. "Java", separator<character> e.g. ";"
 # returns: <tibble> df with the extra column "<stringColName>_<answerOption>" placed directly after the answers column
 addBoolColForAnswerOption <- function(df, stringColName, answerOption, separator) {
   answers <- df[[stringColName]]
   isSelected <- map_lgl(
     answers,
     function(answer) {
       isInAnswerOptions(answer, optionToCheck = answerOption, separator = separator)
     }
   )
   # all_of() anchors on exactly the answers column; a suffix match such as
   # ends_with() could also pick up other columns whose names end the same way.
   mutate(
     df,
     "{stringColName}_{answerOption}" := isSelected,
     .after = all_of(stringColName)
   )
 }

 # Expands a concatenated multiple-choice column into one logical column per answer option.
 #
 # parameters: df <tibble>, stringColName<character>, name of the column with the answers, separator<character> e.g. ";"
 # returns: <tibble> df with the original column kept and the option columns added
 stringColToBoolMatches <- function(df, stringColName, separator) {
   answerOptions <- getUniqueValuesFromStringColumn(df, stringColName, separator)
   # Fold over the options, starting from df and adding one column per option.
   Reduce(
     function(expanded, option) {
       addBoolColForAnswerOption(expanded, stringColName, option, separator)
     },
     answerOptions,
     df
   )
 }

 # Example usage with the Stack Overflow developer survey
 # (data from https://insights.stackoverflow.com/survey):
 survey_results_public <- read_csv("C:/Users/jan/Downloads/survey_results_public.csv")

 # Expand the multiple-choice questions one after another, then drop any "_NA"
 # columns (created when missing answers are collected as the string "NA").
 so_forAnalysis <- survey_results_public %>%
   stringColToBoolMatches("LearnCode", ";") %>%
   stringColToBoolMatches("LearnCodeOnline", ";") %>%
   stringColToBoolMatches("LanguageHaveWorkedWith", ";") %>%
   stringColToBoolMatches("CodingActivities", ";") %>%
   select(-ends_with("_NA"))
	# Survey tools often store the answers to a multiple-choice question as a single value of concatenated strings:
	# If asking which programming languages from a list people use, tools will output "C;Python;R" if users selected these options.
	# These will be put in a column representing the question’s answers. However, this format is hard to work with.
	#
	# The code below creates a %>%-able function "stringColToBoolMatches"
	#
	# parameters: df <tibble>, stringColName<character>, name of the column with the answers, separator<character> e.g. ";"
	# returning: a new dataframe that keeps the original column but expands its values to boolean columns, one for each multiple choice option
	#
	# the new columns are named "<column>_<option>", one per unique answer option, and can be NA (original value was NA), TRUE (option selected) or FALSE (option not selected)
	# e.g. if the column "lang" had a value like "C;Python;R", in the same row the columns "lang_C", "lang_Python", "lang_R" will be TRUE,
	# but "lang_Java", "lang_go", etc. will be FALSE.

	library(tidyverse)

	# Collects every distinct answer option that occurs in a multiple-choice column.
	#
	# parameters: df <tibble>, columnName<character>, separator<character> e.g. ";"
	# returns: <character> vector of the unique options, sorted in decreasing order
	#          (character(0) if the column holds no answers at all)
	#
	# Missing answers (NA) are skipped, so they do not show up as a literal "NA" option.
	# The separator is matched literally, not as a regular expression, so separators
	# such as "|" or "." work as expected.
	getUniqueValuesFromStringColumn <- function(df, columnName, separator) {
	  answers <- as.character(df[[columnName]])
	  answers <- answers[!is.na(answers)]

	  # Split each row on its own so the options found here are exactly the
	  # pieces a per-row split of the same value produces.
	  options <- unlist(
	    strsplit(answers, split = separator, fixed = TRUE),
	    use.names = FALSE
	  )
	  if (is.null(options)) {
	    return(character(0))
	  }

	  sort(unique(options), decreasing = TRUE)
	}

	# Checks whether one answer option was selected in a single concatenated answer.
	#
	# parameters: answersValue<character> e.g. "C;python;Java", optionToCheck<character> e.g. "Java", separator<character> e.g. ";"
	# returns: <boolean> TRUE if the option is among the answers, FALSE if it is not, NA if answersValue is NA
	#
	# The separator is matched literally (not as a regular expression) and the pieces
	# are compared exactly, so whitespace around an option is significant.
	isInAnswerOptions <- function(answersValue, optionToCheck, separator) {
	  if (is.na(answersValue)) {
	    return(NA)
	  }
	  # strsplit returns a list with one element per input string; take the only one.
	  selectedOptions <- strsplit(answersValue, split = separator, fixed = TRUE)[[1]]
	  optionToCheck %in% selectedOptions
	}


	# Adds one logical column telling, per row, whether answerOption was selected.
	#
	# parameters: df <tibble>, stringColName<character>, name of the column with the answers, answerOption<character> e.g. "Java", separator<character> e.g. ";"
	# returns: <tibble> df with the extra column "<stringColName>_<answerOption>" placed directly after the answers column
	addBoolColForAnswerOption <- function(df, stringColName, answerOption, separator) {
	  answers <- df[[stringColName]]
	  isSelected <- map_lgl(
	    answers,
	    function(answer) {
	      isInAnswerOptions(answer, optionToCheck = answerOption, separator = separator)
	    }
	  )
	  # all_of() anchors on exactly the answers column; a suffix match such as
	  # ends_with() could also pick up other columns whose names end the same way.
	  mutate(
	    df,
	    "{stringColName}_{answerOption}" := isSelected,
	    .after = all_of(stringColName)
	  )
	}

	# Expands a concatenated multiple-choice column into one logical column per answer option.
	#
	# parameters: df <tibble>, stringColName<character>, name of the column with the answers, separator<character> e.g. ";"
	# returns: <tibble> df with the original column kept and the option columns added
	stringColToBoolMatches <- function(df, stringColName, separator) {
	  answerOptions <- getUniqueValuesFromStringColumn(df, stringColName, separator)
	  # Fold over the options, starting from df and adding one column per option.
	  Reduce(
	    function(expanded, option) {
	      addBoolColForAnswerOption(expanded, stringColName, option, separator)
	    },
	    answerOptions,
	    df
	  )
	}

	# Example usage with the Stack Overflow developer survey
	# (data from https://insights.stackoverflow.com/survey):
	survey_results_public <- read_csv("C:/Users/jan/Downloads/survey_results_public.csv")

	# Expand the multiple-choice questions one after another, then drop any "_NA"
	# columns (created when missing answers are collected as the string "NA").
	so_forAnalysis <- survey_results_public %>%
	  stringColToBoolMatches("LearnCode", ";") %>%
	  stringColToBoolMatches("LearnCodeOnline", ";") %>%
	  stringColToBoolMatches("LanguageHaveWorkedWith", ";") %>%
	  stringColToBoolMatches("CodingActivities", ";") %>%
	  select(-ends_with("_NA"))