Created
August 2, 2021 16:58
-
-
Save kylebutts/23831709a6def3843caf901630bed08b to your computer and use it in GitHub Desktop.
Factors to sparse matrix efficiently
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Create a sparse 0/1 indicator matrix from factor-like columns
#'
#' Every column of `df` is passed through `factor()` and expanded into one
#' indicator column per level. The indicator columns of the first variable
#' come first, followed by those of the second variable, and so on. Within a
#' variable, columns follow the order of the factor levels.
#'
#' @param df A data frame containing the columns that will be turned into the
#'   sparse matrix. The columns do not need to be factors already. Do not
#'   include any other variables in `df`: every column is expanded.
#' @return A sparse matrix, as built by `Matrix::sparseMatrix()`, with
#'   `nrow(df)` rows and one column per factor level across all variables.
#'   Each row holds a 1 in the column of the level it takes for each variable.
#' @examples
#' fact_to_sparse(data.frame(x = c("A", "B", "A"), y = c("D", "D", "E")))
fact_to_sparse <- function(df) {
  stopifnot(is.data.frame(df))

  # `df[] <-` keeps the data frame shape even for a single column, whereas
  # `df[, names(df)]` would drop a one-column data frame to a plain vector.
  df[] <- lapply(df, factor)

  # The row count comes from the input itself, not from a global variable.
  n <- nrow(df)

  # Number of levels for each variable (type-stable, unlike `sapply()`).
  n_levels <- vapply(df, nlevels, integer(1))

  # Integer level code of every cell, stacked variable by variable.
  codes <- unlist(lapply(df, as.integer), use.names = FALSE)
  if (anyNA(codes)) {
    stop("`df` must not contain missing values.", call. = FALSE)
  }

  # Rows: each variable contributes one entry per observation.
  i <- rep(seq_len(n), times = ncol(df))

  # Columns: shift each variable's codes by the number of levels that belong
  # to the variables preceding it.
  offsets <- cumsum(c(0L, utils::head(n_levels, -1L)))
  j <- codes + rep(offsets, each = n)

  # Every stored entry is 1; dimensions are stated explicitly so the shape
  # does not depend on which indices happen to occur.
  Matrix::sparseMatrix(
    i = i,
    j = j,
    x = rep(1, length(i)),
    dims = c(n, sum(n_levels))
  )
}
# ---- Example --------------------------------

# Number of simulated observations.
n <- 1e6

# Two categorical variables drawn at random with replacement.
df <- data.frame(
  x = factor(sample(c("A", "B", "C"), n, replace = TRUE)),
  y = factor(sample(c("D", "E"), n, replace = TRUE))
)

mat <- fact_to_sparse(df)

# Size comparison: sparse representation versus its dense equivalent.
# `as.matrix()` keeps the row/column layout of `mat`, whereas `matrix(mat)`
# would flatten it into a single column.
pryr::object_size(mat)
pryr::object_size(as.matrix(mat))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment