Last active
November 17, 2022 16:46
-
-
Save samuelsaari/4b198274aabadf2f6061080973789d7c to your computer and use it in GitHub Desktop.
Latent Class Growth Modelling with multinomial response in R - attempt to solve
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#------------------------------------------------------------------------------------------------- | |
# libraries | |
rm(list=ls()) | |
# install.packages("TraMineR") # example data | |
# install.packages("OpenRepGrid")# random words for headings | |
# install.packages("khroma") # color palletttes | |
# install.packages("tidyverse") | |
# install.packages("modeldb") | |
# install.packages("car") | |
# install.packages("jtools") # for a nice theme | |
library(flexmix) # GMM & LCGM | |
library(TraMineR) # example data | |
library(OpenRepGrid)# random words for headings | |
library(khroma) # color palletttes | |
library(tidyverse) | |
#library(modeldb) | |
library(car) | |
library(jtools) # for a nice theme | |
library(foreach) | |
library(ggpubr) | |
theme_set(theme_nice(base_family = 'Consolas' )) | |
# parameters | |
TEST_RUN=T | |
set.seed(102) | |
# loading and wrangling data | |
data(biofam) | |
biofam | |
biofam <- biofam %>% rename(gender=sex) | |
d <- biofam %>% select(.,gender,starts_with('a'),p02r01) | |
d$id <- 1:nrow(d) | |
d <- d %>% pivot_longer(.,names_to = "age",cols = starts_with('a'),values_to="rel_stat") | |
d | |
d <- d %>% mutate(across(c(gender,rel_stat), as.factor)) | |
d <- d %>% transform(age=str_replace(age,"a","")) | |
d <- d %>% mutate(age=as.integer(age)) | |
d <- tibble(d) | |
#d <- d %>% mutate(age=age-14) | |
d | |
# 1 single | |
# 2 married | |
# 3 child | |
# 4 divorced | |
d$rel_stat <- car::recode(d$rel_stat,"0=1;1=1;2=2;3=2;4=3;5=3;6=3;7=4") | |
#d2 <- d %>% modeldb::add_dummy_variables(rel_stat,auto_values = T) | |
# making alternative ways to run the model | |
if (TEST_RUN) { | |
d <- d %>% filter(p02r01 %in% c("no denomination or religion")) | |
nr_of_classes <- 3 | |
} else { | |
nr_of_classes <- 4 | |
} | |
vector_of_chosen_classes <- 1:nr_of_classes | |
levels_rel_stat <- 1:4 # note that this is defined manually | |
AGELOW=15 | |
AGEHIGH=30 | |
NR_OF_YEARS_VECTOR <- AGELOW:AGEHIGH | |
NR_OF_YEARS=length(NR_OF_YEARS_VECTOR) | |
#---------------------------------------------------------------------------------- | |
# running Latent class growth modelling | |
#lcgm_formula <- as.formula(rel_stat~age + I(age^2) + gender + gender:age) | |
lcgm_formula <- as.formula(rel_stat~ age + I(age^2) +gender + age:gender + I(age^2):gender) | |
lcgm <- flexmix::flexmix(.~ .| id, | |
data=d, | |
k=nr_of_classes, # would be 1:12 in real analysis | |
#nrep=1, # would be 50 in real analysis to avoid local maxima (and we would use the stepFlexmix function instead) | |
control = list(iter.max = 500, minprior = 0), | |
model = flexmix::FLXMRmultinom(lcgm_formula)) | |
#--------------------------------------------------------------------------------- | |
# finding parameters | |
(parameter_list <- purrr::map(lcgm@components, function(x) x[[1]]@parameters$coef)) | |
# adding class to original data | |
d$class_orig <- lcgm@cluster | |
#----------------------------------------------------------------------------- | |
# Changing the order of the classes - for data | |
does_custom_order_matrix_exist <- ifelse(nr_of_classes==3,T,F) | |
if (does_custom_order_matrix_exist) { | |
(order_matrix <- matrix(c(1,3, #the number indicates where the classes were before... | |
2,1, #the order indicates were we want to have them a.k.a. new classes | |
3,2), | |
ncol=2,byrow=T)) | |
} else { | |
order_matrix <- matrix(cbind(rep(vector_of_chosen_classes,2)),ncol=2,byrow=F) | |
} | |
order_matrix | |
# helper function to swap the order of the classes | |
recode_class <- function(ORIG_CLASS,IS_MALE) { | |
column <- ifelse(IS_MALE,1,2) | |
vector_of_chosen_classes[ match(ORIG_CLASS,order_matrix[,column]) ] # logic: c(1,2,3)[c(T,F,F)]=1 | |
} | |
# change order of classses | |
d <- d %>% mutate(.,class=ifelse(gender=="man",recode_class(class_orig,IS_MALE=TRUE),recode_class(class_orig,IS_MALE=FALSE))) | |
#----------------------------------------------------------------------------- | |
# Changing the order of the classes - for parameters and predictions | |
stratify_coefs_by_gender_helper <- function(COEF_MATRIX,WOMAN_INDICATOR) { | |
intercept <- COEF_MATRIX[,"(Intercept)"]+WOMAN_INDICATOR*COEF_MATRIX[,"genderwoman"] | |
age <- COEF_MATRIX[,"age"]+WOMAN_INDICATOR*COEF_MATRIX[,"age:genderwoman"] | |
age2 <- COEF_MATRIX[,"I(age^2)"] +WOMAN_INDICATOR*COEF_MATRIX[,"I(age^2):genderwoman"] | |
t(cbind(intercept,age,age2)) | |
} | |
stratify_coefs_by_gender <- function(COEF_MATRIX) { | |
list("male_coefs"=stratify_coefs_by_gender_helper(COEF_MATRIX,0), "female_coefs"=stratify_coefs_by_gender_helper(COEF_MATRIX,1)) | |
} | |
# after this, order original, but seperate estimates for men/women | |
parameter_list_by_gender_orig_order <- purrr::map(parameter_list,stratify_coefs_by_gender) | |
parameter_list_by_gender_orig_order | |
# making a copy of the parameter list that is stratified by gender | |
parameter_list_by_gender <- parameter_list_by_gender_orig_order | |
# substituting new values based on the order_matrix (defined above) | |
for (j in vector_of_chosen_classes) { | |
for (i in 1:2) { | |
class_name=paste0('class',i) | |
coef_gender <- ifelse(i==1,"male_coefs","female_coefs") | |
column <- ifelse(i==1,1,2) | |
new_index <- which(order_matrix[,column]==j) | |
parameter_list_by_gender[[new_index]][[i]] <- parameter_list_by_gender_orig_order[[j]][[i]] | |
} | |
} | |
parameter_list_by_gender_orig_order | |
parameter_list_by_gender # now the order is changed | |
# we will predict the values based on all combinations of age, age^2 and gender | |
prediction_dataframe <- expand.grid(c=1,age=NR_OF_YEARS_VECTOR) # gender is included implicitly (coef_matrices different for both genders) | |
prediction_dataframe$age2 <- prediction_dataframe$age^2 | |
(prediction_matrix <- as.matrix(prediction_dataframe)) | |
predict_one_gender <- function(COEF_MATRIX,PREDICTION_MATRIX=prediction_matrix) { | |
#add vector for the number of rel_stats!!!!!!!!!!!!!!! Obs nb! | |
# https://www.ibm.com/support/pages/compute-predicted-probabilities-multinomial-logistic-new-cases-or-outside-spss | |
predictions <- PREDICTION_MATRIX %*% COEF_MATRIX | |
predictions <- exp(predictions) | |
predictions <- cbind(1,predictions) # adding ref category | |
predictions <- cbind(predictions,rowSums(predictions)) | |
colnames(predictions) <- c(levels_rel_stat,"row_sums") | |
probabilities <- apply(predictions[,levels_rel_stat],2,function(x) x/predictions[,"row_sums"] ) | |
return(probabilities) | |
} | |
predict_one_class <- function(COEF_MATRIX_LIST,PREDICTION_MATRIX=prediction_matrix) { | |
male_probabilities <- predict_one_gender(COEF_MATRIX_LIST$male_coefs) | |
female_probabilities <- predict_one_gender(COEF_MATRIX_LIST$female_coefs) | |
return(list("male_probabilities"=male_probabilities,"female_probabilities"=female_probabilities)) | |
} | |
prediction_by_class_and_gender <- purrr::map(parameter_list_by_gender,predict_one_class) | |
# now we have all the probabilities we need, and will add some info and shape the data | |
add_gender_to_prediction_matrix <- function(my_list) { | |
male_probabilities <- cbind(my_list$male_probabilities,gender=1) | |
female_probabilities <- cbind(my_list$female_probabilities,gender=2) | |
return(list("male_probabilities"=male_probabilities,"female_probabilities"=female_probabilities)) | |
} | |
combine_male_and_female_prediction <- function(my_list){ | |
my_list <- purrr::map(my_list,function(x) cbind(x,age=NR_OF_YEARS_VECTOR)) # add age | |
my_list <- add_gender_to_prediction_matrix(my_list) | |
combined_matrix <- do.call(rbind,my_list) | |
return(combined_matrix) | |
} | |
prediction_by_class <- purrr::map(prediction_by_class_and_gender, | |
combine_male_and_female_prediction) | |
pivot_predictions_to_long <- function(combined_matrix) { | |
combined_tibble <- setNames(as_tibble(combined_matrix,.name_repair = "minimal"),c(levels_rel_stat,"age","gender")) | |
long_matrix <- pivot_longer(data=combined_tibble, | |
cols = -all_of(c("age","gender")), | |
names_to = "rel_stat", | |
values_to = "probability") | |
return(long_matrix) | |
} | |
prediction_tibbles_long <- purrr::map(prediction_by_class, | |
pivot_predictions_to_long) | |
# now we have the predicted data in the right format. | |
#-------------------------------------------------------------------------------------------------- | |
# let us confirm that the results would be identical if we fitted the data automatically and not manually as above. | |
# The order will be different | |
fitted_lcgm <- fitted(lcgm) | |
fitted_tibbles <- lapply(fitted_lcgm, function(x) cbind(x,d$age,d$gender)) | |
fitted_tibbles <- lapply(fitted_tibbles,function(x) setNames(as_tibble(x,.name_repair = "minimal"),c(levels_rel_stat,"age","gender") )) | |
fitted_tibbles_long <-purrr::map(fitted_tibbles, function(x) { | |
pivot_longer(data=x,cols = -all_of(c("age","gender")), names_to = "rel_stat", values_to = "probability") | |
} ) | |
fitted_tibbles_long <- purrr::map(fitted_tibbles_long,distinct) # remove duplicate rows | |
# helpers for plotting | |
text_size <- 12 | |
title_size <- text_size*1.2 | |
line_size=1.6 | |
line_size_legend <- line_size*1.75 | |
gender_labels <- c(`1`="\U2642",`2`="\U2640") | |
rw <- function() { | |
randomWords(nr_of_classes) | |
} | |
(class_titles <- paste(vector_of_chosen_classes,"class:",rw(),rw(),rw())) | |
#-------------------------------------------------------------------------------- | |
# plotting | |
plot_single_class_line <- function(CLASS_DATA,TITLE){ | |
ggplot(CLASS_DATA, aes(x = age,y = probability,color=rel_stat)) + | |
geom_line(linewidth=line_size) + | |
scale_color_vibrant() + | |
guides(color = guide_legend(reverse = TRUE)) + | |
# common for both plots + | |
ggtitle(TITLE) + | |
labs (x = NULL, y=NULL) + | |
facet_wrap(~ gender,strip.position="left",labeller=labeller(gender=gender_labels))+ | |
theme(text = element_text(size=text_size), | |
#legend.key.size = unit(1,"pt"), | |
legend.title=element_blank(), | |
plot.title = element_text(size =title_size,margin=margin(b=5)), | |
strip.text.y.left= element_text(angle=0, size=text_size*1.6)) | |
} | |
#plot_single_class_line(fitted_tibbles_long[[1]],TITLE=class_titles[1]) | |
# saving the plots | |
class_plot_list_line<- purrr::map(vector_of_chosen_classes,function(i) plot_single_class_line(fitted_tibbles_long[[i]],TITLE=class_titles[i])) | |
class_plot_list_line_prediction <- purrr::map(vector_of_chosen_classes,function(i) plot_single_class_line(prediction_tibbles_long[[i]],TITLE=class_titles[i])) | |
# arranging the plot lists with ggarrange | |
(plot_lcgm_line <- do.call(ggpubr::ggarrange,c(class_plot_list_line, list(common.legend = TRUE, legend = "bottom",ncol=1)))) | |
(plot_lcgm_line <- do.call(ggpubr::ggarrange,c(class_plot_list_line, list(common.legend = TRUE, legend = "bottom",ncol=1)))) | |
ggpubr::annotate_figure(plot_lcgm_line, top =text_grob("Original model", size = 20,face="bold")) | |
(plot_lcgm_line_prediction <- do.call(ggpubr::ggarrange,c(class_plot_list_line_prediction, list(common.legend = TRUE, legend = "bottom",ncol=1)))) | |
ggpubr::annotate_figure(plot_lcgm_line_prediction, top =text_grob("Label switching corrected", size = 20,face="bold")) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment