Created
May 13, 2017 18:55
-
-
Save cecilesauder/37c39bff860846dedea4bce91a70a87f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr)
# If dplyr is not installed yet: install.packages("dplyr")

# Read the input files ----
data_train <- read.csv(file = "/home/cecile/Documents/R/train.csv")
data_test <- read.csv(file = "/home/cecile/Documents/R/test.csv")

# Quick exploration ----
# glimpse() is just another kind of summary. `%>%` is the pipe used by dplyr:
# `data_train %>% glimpse()` is exactly the same as `glimpse(data_train)`,
# but reads as "subject %>% verb" instead of "verb(subject)".
data_train %>% glimpse()
# NOTE(review): this only shows columns if read.csv() produced factors, which
# depends on its `stringsAsFactors` default (FALSE since R 4.0) -- confirm.
data_train %>% select_if(is.factor) %>% glimpse()
data_train %>% summary()
data_test %>% summary()
#' Test whether a column has fewer than `threshold` missing values.
#'
#' Intended as a column predicate (e.g. for `select_if()`), which calls it
#' with the column as the only argument, so `threshold` keeps its default.
#'
#' @param col A vector, typically one column of a data frame.
#' @param threshold Number of `NA`s from which the column is rejected.
#'   Defaults to 200.
#' @return A single logical: `TRUE` when `col` holds strictly fewer than
#'   `threshold` `NA` values, `FALSE` otherwise.
less_NA <- function(col, threshold = 200) {
  sum(is.na(col)) < threshold
}
#' Test whether a column has at least `threshold` missing values.
#'
#' Uses `>=` so that, with the default threshold, this is the exact
#' complement of a "fewer than 200 NA" filter: a column holding exactly
#' 200 `NA`s is reported here instead of falling between the two tests.
#'
#' @param col A vector, typically one column of a data frame.
#' @param threshold Minimum number of `NA`s for the column to be flagged.
#'   Defaults to 200.
#' @return A single logical: `TRUE` when `col` holds `threshold` or more
#'   `NA` values, `FALSE` otherwise.
more_NA <- function(col, threshold = 200) {
  sum(is.na(col)) >= threshold
}
# Keep only the columns that have fewer than 200 NA values.
# NOTE(review): the filter is applied to train and test independently, so the
# two data frames may end up with different column sets -- confirm intended.
data_train2 <- data_train %>% select_if(less_NA)
data_test2 <- data_test %>% select_if(less_NA)

# Fit a linear model of SalePrice on every remaining column. Per the original
# author's note, the call no longer errors once the high-NA columns are gone.
mod <- lm(SalePrice ~ ., data = data_train2)

# List the columns that were causing the problem (the high-NA ones).
data_train %>% select_if(more_NA) %>% names()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment