Last active
January 20, 2024 16:42
-
-
Save jdittrich/f80a3ebbb18ff993fd22fd1199b8d40b to your computer and use it in GitHub Desktop.
Data processing for surveys with multiple choice answers put in a column with separated string-values like "R;Python;C"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Survey tools often store the answers to a multiple-choice question as a single value of concatenated strings:
# When asked which programming languages from a list they use, tools will output "C;Python;R" if users selected these options.
# These values are put in one column representing the question’s answers. However, this format is hard to work with.
# | |
# The code below creates a %>%-able function "stringColToBoolMatches" | |
# | |
# parameters: df <tibble>, stringColName<character>, name of the column with the answers, separator<character> e.g. ";" | |
# returning: a new dataframe that keeps the original column but expands its values to boolean columns, one for each multiple choice option | |
# | |
# the columns are named after the unique answer options and can be NA (original value was NA), TRUE (value present) or FALSE (value not present)
# e.g. if the column "lang" had a value like "C;Python;R", in the same row the columns "lang_C", "lang_Python", "lang_R" will be TRUE,
# but "lang_Java", "lang_go", etc. will be FALSE.
library(tidyverse) | |
# Collects every distinct answer option that occurs in a multiple-choice column.
#
# parameters: df <tibble>, columnName<character> name of the column holding the
#             separated answers, separator<character> literal separator, e.g. ";"
# returns: <character> vector of the unique answer options, sorted in decreasing
#          order; character(0) if the column holds no non-missing values
getUniqueValuesFromStringColumn <- function(df, columnName, separator) {
  column <- df[[columnName]]
  # Drop missing answers up front: pasting them would turn NA into the literal
  # string "NA" and report it as if it were a real answer option.
  answered <- as.character(column[!is.na(column)])
  # Split each cell on its own (no paste/collapse round trip) and treat the
  # separator literally, so separators such as "|" or "." are not read as regex.
  answerOptions <- unlist(strsplit(answered, split = separator, fixed = TRUE))
  if (is.null(answerOptions)) {
    return(character(0))
  }
  sort(unique(answerOptions), decreasing = TRUE)
}
# Tests whether one answer option was selected in a single separated answer string.
#
# parameters: answersValue<character> one cell of the answer column, e.g. "C;Python;Java" (may be NA),
#             optionToCheck<character> e.g. "Java", separator<character> literal separator, e.g. ";"
# returns: <boolean> TRUE if optionToCheck is one of the separated values,
#          FALSE if it is not, NA if answersValue is NA
isInAnswerOptions <- function(answersValue, optionToCheck, separator) {
  # A missing answer stays missing instead of counting as "not selected".
  if (is.na(answersValue)) {
    return(NA)
  }
  # fixed = TRUE: the separator is a literal string, not a regular expression,
  # so separators like "|" or "." split where expected.
  selectedOptions <- strsplit(
    as.character(answersValue),
    split = separator,
    fixed = TRUE
  )[[1]]
  optionToCheck %in% selectedOptions
}
# Adds one boolean column telling, per row, whether answerOption was selected.
#
# parameters: df <tibble>, stringColName<character> name of the column with the answers,
#             answerOption<character> e.g. "Java", separator<character> e.g. ";"
# returns: <tibble> df plus a column named "<stringColName>_<answerOption>" placed
#          directly after the stringColName column; its values are TRUE, FALSE or NA
addBoolColForAnswerOption <- function(df, stringColName, answerOption, separator) {
  stringCol <- df[[stringColName]]
  isSelected <- map_lgl(
    stringCol,
    function(answersValue) {
      isInAnswerOptions(
        answersValue,
        optionToCheck = answerOption,
        separator = separator
      )
    }
  )
  # all_of() anchors on exactly the answer column; a suffix match could also hit
  # other columns whose names merely end with stringColName.
  # .env$ makes sure the local vector is used even if df has a column of the
  # same name. The mutate() result is returned visibly.
  mutate(
    df,
    "{stringColName}_{answerOption}" := .env$isSelected,
    .after = all_of(stringColName)
  )
}
# Expands a separated multiple-choice column into one boolean column per answer option.
#
# parameters: df <tibble>, stringColName<character> name of the column with the answers,
#             separator<character> e.g. ";"
# returns: <tibble> df with the original column kept and one added column per
#          unique answer option found in that column
stringColToBoolMatches <- function(df, stringColName, separator) {
  answerOptions <- getUniqueValuesFromStringColumn(df, stringColName, separator)
  # Fold over the options, threading the growing tibble through each step;
  # with no options the input df is returned as is.
  Reduce(
    function(expanded, option) {
      addBoolColForAnswerOption(expanded, stringColName, option, separator)
    },
    answerOptions,
    df
  )
}
# Example usage with the Stack Overflow developer survey
# (https://insights.stackoverflow.com/survey):
survey_results_public <- read_csv("C:/Users/jan/Downloads/survey_results_public.csv")

# Expand each multiple-choice question in turn, feeding the result of one
# expansion into the next.
so_forAnalysis <- Reduce(
  function(expanded, questionColumn) {
    stringColToBoolMatches(
      expanded,
      stringColName = questionColumn,
      separator = ";"
    )
  },
  c("LearnCode", "LearnCodeOnline", "LanguageHaveWorkedWith", "CodingActivities"),
  survey_results_public
) %>%
  # Drop any columns ending in "_NA"; such columns appear when missing answers
  # are treated as the literal string "NA" during expansion.
  select(-ends_with("_NA"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment