Skip to content

Instantly share code, notes, and snippets.

@Maes95
Created March 2, 2018 15:18
Show Gist options
  • Save Maes95/d061ea744db963fb371dc6bd55ca1e3a to your computer and use it in GitHub Desktop.
Save Maes95/d061ea744db963fb371dc6bd55ca1e3a to your computer and use it in GitHub Desktop.
library(dplyr)
library(magrittr)
# Load the Adult census-income data set. The file is read as headerless, so
# the 15 column names are assigned explicitly, in file order.
#
# stringsAsFactors = TRUE: the rest of the script calls levels() on, and draws
#   bar plots of, the text columns, which requires them to be factors. Since
#   R 4.0 read.csv() no longer creates factors by default.
# strip.white = TRUE: drops blanks around unquoted fields so values compare
#   equal to the bare labels used later (e.g. "?", "Cuba"). This has no effect
#   if the file contains no padding.
data <- read.csv(
  "adult.csv",
  header = FALSE,
  sep = ",",
  stringsAsFactors = TRUE,
  strip.white = TRUE
)
colnames(data) <- c(
  "Age",
  "Workclass",
  "Fnlwgt",
  "Education",
  "EducationNum",
  "MaritalStatus",
  "Occupation",
  "Relationship",
  "Race",
  "Sex",
  "CapitalGain",
  "CapitalLoss",
  "HoursPerWeek",
  "NativeCountry",
  "Income"
)
# Recode Income as a factor whose first two levels are labelled "FALSE" and
# "TRUE", so later filters can be written as `Income == TRUE`.
# The relabelling is positional: factor levels are sorted, so the first level
# becomes "FALSE" and the second "TRUE" (presumably "<=50K" and ">50K" --
# confirm against the data file).
# levels() only recodes factors; on a character column (the read.csv() default
# since R 4.0) it would merely attach a stray attribute, so coerce first.
if (!is.factor(data$Income)) {
  data$Income <- factor(data$Income)
}
levels(data$Income)[1:2] <- c("FALSE", "TRUE")
head(data, 100)
# Sex
# -> No missing values | about twice as many men as women (2/3 vs 1/3)
# -> 10% of the women earn more than 50K
# -> 30% of the men earn more than 50K
# List the categories, then draw the per-category counts.
levels(data[["Sex"]])
plot(data[["Sex"]])
# CapitalGain
# -> 91% of the values are 0
# Keep the rows with a positive capital gain whose Income flag is TRUE.
no_0_cg <- filter(data, CapitalGain > 0, Income == TRUE)
# -> Of the remaining 9%: 1677 rows have Income == TRUE, 1035 Income == FALSE
plot(data$CapitalGain)
# CapitalLoss
# -> 95% of the values are 0
# Keep the rows with a positive capital loss whose Income flag is TRUE.
no_0_cl <- filter(data, CapitalLoss > 0, Income == TRUE)
# -> Of the remaining 5%: 773 rows have Income == TRUE, 746 Income == FALSE
plot(data$CapitalLoss)
# HoursPerWeek
# -> 46% of the records are exactly 40 hours/week (the mode)
# Histogram of the weekly working hours.
hist(data$HoursPerWeek)
# NativeCountry
# -> 517 people have no country recorded (value "?")
# -> Relabel that value so these rows form a named group of their own
# levels() can only relabel factors; on a character column (the read.csv()
# default since R 4.0) the relabel below would silently do nothing, so coerce
# first.
if (!is.factor(data$NativeCountry)) {
  data$NativeCountry <- factor(data$NativeCountry)
}
# Trim any padding around the labels so that "?" (and the country names
# compared later) match exactly; this is a no-op when there is none.
levels(data$NativeCountry) <- trimws(levels(data$NativeCountry))
levels(data$NativeCountry)[levels(data$NativeCountry) == "?"] <- "Other"
plot(data$NativeCountry)
# Continent: derive a geographic-area factor from NativeCountry.
# -> The original notes count 40 countries plus 1 group with no country; the
#    table below lists every country label the script maps.
# -> Rows whose NativeCountry is "South" are assigned to Asia, because most of
#    them are of Asian race.
# -> Countries are grouped into 5 geographic areas plus one "Other" group.
# -> Rows in "Other" have varied races, so they cannot be located.
# Labels are kept exactly as spelled in the data (e.g. "Columbia",
# "Trinadad&Tobago", "Holand-Netherlands", "Hong").
# Ecuador is placed in SouthAmerica (it was previously filed under
# CentralAmerica, which is geographically wrong).
# local() keeps the helper objects out of the global environment.
data$Continent <- local({
  continent_members <- list(
    CentralAmerica = c(
      "Cuba", "Dominican-Republic", "Mexico", "Honduras", "Nicaragua",
      "El-Salvador", "Guatemala", "Jamaica", "Puerto-Rico",
      "Trinadad&Tobago", "Haiti"
    ),
    NorthAmerica = c("United-States", "Canada", "Outlying-US(Guam-USVI-etc)"),
    SouthAmerica = c("Columbia", "Ecuador", "Peru"),
    Europe = c(
      "France", "Holand-Netherlands", "Germany", "Ireland", "Poland",
      "Yugoslavia", "Greece", "Italy", "Portugal", "Hungary", "England",
      "Scotland"
    ),
    Asia = c(
      "Iran", "Laos", "Vietnam", "Cambodia", "Taiwan", "Hong", "Thailand",
      "China", "India", "Japan", "Philippines", "South"
    ),
    # "?" is accepted as well, in case the missing-country marker has not
    # been relabelled to "Other" before this point.
    Other = c("Other", "?")
  )

  # Flatten the groups into a named vector: country label -> area name.
  country_to_continent <- setNames(
    rep(names(continent_members), lengths(continent_members)),
    unlist(continent_members, use.names = FALSE)
  )

  # Work on trimmed character labels so the lookup behaves the same for
  # factor and character columns, with or without padding.
  native_country <- trimws(as.character(data$NativeCountry))

  # Unmapped labels end up as NA; report them instead of hiding them.
  unmapped <- setdiff(
    unique(native_country[!is.na(native_country)]),
    names(country_to_continent)
  )
  if (length(unmapped) > 0) {
    warning(
      "NativeCountry values without a Continent mapping: ",
      paste(unmapped, collapse = ", "),
      call. = FALSE
    )
  }

  factor(unname(country_to_continent[native_country]))
})
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment