Created
March 2, 2018 15:18
-
-
Save Maes95/d061ea744db963fb371dc6bd55ca1e3a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr)
library(magrittr)

# Load the Adult census extract. The file has no header row, so the column
# names are assigned explicitly right after reading.
#
# stringsAsFactors = TRUE: the rest of this script calls levels() on the
#   categorical columns. Since R 4.0 read.csv() returns character columns by
#   default, which would turn those calls into silent no-ops.
# strip.white = TRUE: drops padding around unquoted fields so that values
#   compare equal to the plain labels used further down (e.g. "?").
data <- read.csv(
  "adult.csv",
  header = FALSE,
  sep = ",",
  stringsAsFactors = TRUE,
  strip.white = TRUE
)

colnames(data) <- c(
  "Age",
  "Workclass",
  "Fnlwgt",
  "Education",
  "EducationNum",
  "MaritalStatus",
  "Occupation",
  "Relationship",
  "Race",
  "Sex",
  "CapitalGain",
  "CapitalLoss",
  "HoursPerWeek",
  "NativeCountry",
  "Income"
)

# Relabel the two Income levels as "FALSE" / "TRUE" so later filters can be
# written as `Income == TRUE`. The relabel is positional: the first level
# becomes "FALSE" and the second "TRUE".
# NOTE(review): this assumes the first (alphabetically smaller) level is the
# "does not exceed 50K" label - confirm against the raw file.
stopifnot(is.factor(data$Income), nlevels(data$Income) == 2L)
levels(data$Income) <- c("FALSE", "TRUE")

# Quick look at the first 100 rows.
head(data, 100)
# Sex
# -> No missing values; twice as many men as women (women 1/3, men 2/3).
# -> About 10% of the women exceed 50K.
# -> About 30% of the men exceed 50K.
levels(data$Sex)
# Bar chart of the number of rows per Sex level.
plot(data$Sex)
# CapitalGain
# -> 91% of the rows have a value of 0.
# -> Of the remaining 9%: 1677 rows have Income == TRUE and 1035 rows have
#    Income == FALSE.
# Rows with a positive capital gain that also fall in the Income == TRUE group.
no_0_cg <- filter(data, CapitalGain > 0, Income == TRUE)
plot(data$CapitalGain)
# CapitalLoss
# -> 95% of the rows have a value of 0.
# -> Of the remaining 5%: 773 rows have Income == TRUE and 746 rows have
#    Income == FALSE.
# Rows with a positive capital loss that also fall in the Income == TRUE group.
no_0_cl <- filter(data, CapitalLoss > 0, Income == TRUE)
plot(data$CapitalLoss)
# HoursPerWeek
# -> 46% of the rows are exactly 40 hours/week (the mode).
# The call is kept in this exact form because hist() builds its default title
# and x-axis label from the expression passed to it.
hist(data$HoursPerWeek)
# NativeCountry
# -> 517 people have no country recorded (value "?").
# -> Relabel that value so those rows form an explicitly named group.
#
# levels() only has an effect on factors. read.csv() stopped creating factors
# by default in R 4.0, so coerce first; otherwise the relabel silently does
# nothing and the bar chart below fails on a character column.
if (!is.factor(data$NativeCountry)) {
  data$NativeCountry <- factor(data$NativeCountry)
}
levels(data$NativeCountry)[levels(data$NativeCountry) == "?"] <- "Other"

# Bar chart of the number of rows per country.
plot(data$NativeCountry)
# -> There are 40 countries in total + 1 group without a country.
# -> Rows whose NativeCountry is "South" are assigned to Asia, because most of
#    them are of Asian race.
# -> Countries are grouped into 5 geographic areas plus 1 "Other" group.
# -> The "Other" group has mixed races, so it cannot be placed geographically.
#
# Country labels are matched literally against NativeCountry, so they must be
# kept exactly as written here (presumably they mirror the spellings used in
# the data file - do not "correct" them).
# Ecuador is listed under SouthAmerica; it had been filed under CentralAmerica.
continent_countries <- list(
  CentralAmerica = c(
    "Cuba", "Dominican-Republic", "Mexico", "Honduras", "Nicaragua",
    "El-Salvador", "Guatemala", "Jamaica", "Puerto-Rico", "Trinadad&Tobago",
    "Haiti"
  ),
  NorthAmerica = c("United-States", "Canada", "Outlying-US(Guam-USVI-etc)"),
  SouthAmerica = c("Columbia", "Ecuador", "Peru"),
  Europe = c(
    "France", "Holand-Netherlands", "Germany", "Ireland", "Poland",
    "Yugoslavia", "Greece", "Italy", "Portugal", "Hungary", "England",
    "Scotland"
  ),
  Asia = c(
    "Iran", "Laos", "Vietnam", "Cambodia", "Taiwan", "Hong", "Thailand",
    "China", "India", "Japan", "Philippines", "South"
  ),
  Other = "Other"
)

# Flatten the list into a named character vector: country -> continent.
country_to_continent <- setNames(
  rep(names(continent_countries), lengths(continent_countries)),
  unlist(continent_countries, use.names = FALSE)
)

# Build the whole column in one assignment. Countries missing from the lookup
# come out as NA instead of depending on indexed assignment into a column
# that does not exist yet.
data$Continent <- factor(
  unname(country_to_continent[as.character(data$NativeCountry)])
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment