Last active
          January 31, 2019 11:47 
        
      - 
      
 - 
        
Save davidgohel/91130b9c77346fa7d2b5b396f434054d to your computer and use it in GitHub Desktop.  
    bench pour compter le nombre de NA en lignes
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | library(purrr) | |
| library(charlatan) | |
| library(microbenchmark) | |
| create_fake_data <- function(n){ | |
| data <- ch_generate('job', 'color_name', 'phone_number', n = n) | |
| data$color_name[sample.int(nrow(data), n/10 )] <- NA_character_ | |
| data$x <- rnorm(n) | |
| data$x[sample.int(nrow(data), n/10 )] <- NA_real_ | |
| data$y <- rpois(n = n, lambda = 12) | |
| data$y[sample.int(nrow(data), n/10 )] <- NA_real_ | |
| data | |
| } | |
| prop0 <- function(dataset){ | |
| rowSums(is.na(dataset)) | |
| } | |
| prop1 <- function(dataset){ | |
| # probleme de cast ; quand les données contiennent des colonnes de types differents | |
| # il y aura changement de type - les NA seront castes correctement en NA mais | |
| # surement que cela fait une operation non necessaire | |
| apply(dataset, MARGIN = 1, function(x){ | |
| sum(is.na(x)) | |
| }) | |
| } | |
| prop2 <- function(dataset){ | |
| # probleme, l'allocation d'un nouveau vecteur | |
| # pour chaque ligne va deteriorer les performances + cast | |
| # quand colonnes de differents types + purrr fait quelques | |
| # operations de plus que sapply | |
| pmap_dbl(dataset, function(...){sum(is.na(c(...)))}) | |
| } | |
| prop3 <- function(dataset){ | |
| # probleme de cette solution, l'allocation d'un nouveau vecteur | |
| # pour chaque ligne va deteriorer les performances + cast | |
| # quand colonnes de differents types | |
| dataset <- as.list(dataset) | |
| dataset$FUN <- function(...){sum(is.na(c(...)))} | |
| dataset$SIMPLIFY <- TRUE | |
| dataset$USE.NAMES = FALSE | |
| do.call(mapply, dataset) | |
| } | |
| prop4 <- function(dataset){ | |
| # moi j'aime bien celle la | |
| Reduce(`+`, lapply(dataset, is.na) ) | |
| } | |
| prop5 <- function(df) { | |
| # celle de Florian - proche de la precedente | |
| nb_na <- integer(nrow(df)) | |
| for (var in df) { | |
| nb_na <- nb_na + is.na(var) | |
| } | |
| nb_na | |
| } | |
| bench_that <- function(x){ | |
| message("Le dataset contient ", nrow(x), " lignes") | |
| bench <- microbenchmark(times = 50L, | |
| `Reduce sur colonnes` = prop4(x), | |
| `rowSums(...)` = prop0(x), | |
| `apply(..., MARGIN = 1)` = prop1(x), | |
| `sapply on one vector per row` = prop3(x), | |
| `purrr on one vector per row` = prop2(x), | |
| `florian` = prop5(x), | |
| `naniar:::n_miss_row(data1)` = naniar:::n_miss_row(x) | |
| ) | |
| print(bench) | |
| bench | |
| } | |
| data1 <- create_fake_data(1000) | |
| data2 <- create_fake_data(10000) | |
| data3 <- create_fake_data(100000) | |
| b1 <- bench_that(data1) | |
| b2 <- bench_that(data2) | |
| b3 <- bench_that(data3) | |
| benchnames <- c("b1", "b2", "b3") | |
| benchbars <- c("bar1", "bar2", "bar3") | |
| bench <- Reduce(rbind, lapply(benchnames, | |
| function(x) { | |
| obj <- get(x) | |
| data.frame(expr = obj$expr, time = obj$time/1000, bench = x) | |
| } ) ) | |
| tab <- with(bench, tapply(time, list(expr = expr, bench = bench), median, na.rm = TRUE)) | |
| print(tab) | 
If using prop5, you won't face the following issue:
> Reduce(`+`, lapply(iris[0], is.na) )
NULL
> prop5(iris[0])
  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 [57] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[113] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
    
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment
  
            
Reduce(+, lapply(dataset, is.na) )is the faster way to count NA per rows. Florian for loop is almost the same.