Last active
January 31, 2019 11:47
-
-
Save davidgohel/91130b9c77346fa7d2b5b396f434054d to your computer and use it in GitHub Desktop.
bench pour compter le nombre de NA en lignes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(purrr) | |
library(charlatan) | |
library(microbenchmark) | |
create_fake_data <- function(n){ | |
data <- ch_generate('job', 'color_name', 'phone_number', n = n) | |
data$color_name[sample.int(nrow(data), n/10 )] <- NA_character_ | |
data$x <- rnorm(n) | |
data$x[sample.int(nrow(data), n/10 )] <- NA_real_ | |
data$y <- rpois(n = n, lambda = 12) | |
data$y[sample.int(nrow(data), n/10 )] <- NA_real_ | |
data | |
} | |
prop0 <- function(dataset){ | |
rowSums(is.na(dataset)) | |
} | |
prop1 <- function(dataset){ | |
# probleme de cast ; quand les données contiennent des colonnes de types differents | |
# il y aura changement de type - les NA seront castes correctement en NA mais | |
# surement que cela fait une operation non necessaire | |
apply(dataset, MARGIN = 1, function(x){ | |
sum(is.na(x)) | |
}) | |
} | |
prop2 <- function(dataset){ | |
# probleme, l'allocation d'un nouveau vecteur | |
# pour chaque ligne va deteriorer les performances + cast | |
# quand colonnes de differents types + purrr fait quelques | |
# operations de plus que sapply | |
pmap_dbl(dataset, function(...){sum(is.na(c(...)))}) | |
} | |
prop3 <- function(dataset){ | |
# probleme de cette solution, l'allocation d'un nouveau vecteur | |
# pour chaque ligne va deteriorer les performances + cast | |
# quand colonnes de differents types | |
dataset <- as.list(dataset) | |
dataset$FUN <- function(...){sum(is.na(c(...)))} | |
dataset$SIMPLIFY <- TRUE | |
dataset$USE.NAMES = FALSE | |
do.call(mapply, dataset) | |
} | |
prop4 <- function(dataset){ | |
# moi j'aime bien celle la | |
Reduce(`+`, lapply(dataset, is.na) ) | |
} | |
prop5 <- function(df) { | |
# celle de Florian - proche de la precedente | |
nb_na <- integer(nrow(df)) | |
for (var in df) { | |
nb_na <- nb_na + is.na(var) | |
} | |
nb_na | |
} | |
bench_that <- function(x){ | |
message("Le dataset contient ", nrow(x), " lignes") | |
bench <- microbenchmark(times = 50L, | |
`Reduce sur colonnes` = prop4(x), | |
`rowSums(...)` = prop0(x), | |
`apply(..., MARGIN = 1)` = prop1(x), | |
`sapply on one vector per row` = prop3(x), | |
`purrr on one vector per row` = prop2(x), | |
`florian` = prop5(x), | |
`naniar:::n_miss_row(data1)` = naniar:::n_miss_row(x) | |
) | |
print(bench) | |
bench | |
} | |
data1 <- create_fake_data(1000) | |
data2 <- create_fake_data(10000) | |
data3 <- create_fake_data(100000) | |
b1 <- bench_that(data1) | |
b2 <- bench_that(data2) | |
b3 <- bench_that(data3) | |
benchnames <- c("b1", "b2", "b3") | |
benchbars <- c("bar1", "bar2", "bar3") | |
bench <- Reduce(rbind, lapply(benchnames, | |
function(x) { | |
obj <- get(x) | |
data.frame(expr = obj$expr, time = obj$time/1000, bench = x) | |
} ) ) | |
tab <- with(bench, tapply(time, list(expr = expr, bench = bench), median, na.rm = TRUE)) | |
print(tab) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
If using
prop5
, you won't face the following issue: