Skip to content

Instantly share code, notes, and snippets.

@joffilyfe
Last active July 27, 2017 12:16
Show Gist options
  • Save joffilyfe/2fe74324746dd6f7116bc3b7c00f6d1f to your computer and use it in GitHub Desktop.
Save joffilyfe/2fe74324746dd6f7116bc3b7c00f6d1f to your computer and use it in GitHub Desktop.
library(tidyverse)
library(lubridate)
library(dplyr)
# Fix the global RNG seed so the k-means runs in this script are reproducible.
set.seed(1000)

#' Plot the within-groups sum of squares for 1 to `nc` k-means clusters
#'
#' Draws an "elbow" plot: the total within-cluster sum of squares obtained by
#' `kmeans()` for every cluster count from 1 up to `nc`.
#'
#' @param data Numeric matrix or data frame; column variances are taken with
#'   `apply(data, 2, var)`, so every column must be numeric.
#' @param nc Largest number of clusters to evaluate (must be >= 1).
#' @param seed Value passed to `set.seed()` before every `kmeans()` call, so
#'   each fit starts from the same RNG state. Note that this overwrites the
#'   caller's global RNG state.
#' @return Invisibly, the numeric vector of within-groups sums of squares
#'   (element `i` corresponds to `i` clusters).
wssplot <- function(data, nc = 15, seed = 1000) {
  stopifnot(is.numeric(nc), length(nc) == 1, nc >= 1)

  # With a single cluster the within-groups SS is the total SS around the
  # column means.
  wss <- (nrow(data) - 1) * sum(apply(data, 2, var))

  # seq_len(nc)[-1] is empty when nc == 1, whereas 2:nc would count down
  # (2, 1) and leave `wss` longer than the x axis.
  for (i in seq_len(nc)[-1]) {
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers = i)$withinss)
  }

  plot(
    seq_len(nc), wss,
    type = "b",
    xlab = "Number of Clusters",
    ylab = "Within groups sum of squares"
  )
  invisible(wss)
}
# Load the case records. read_csv2() expects ";" as the field separator and
# "," as the decimal mark.
dataset <- read_csv2("data/chik2016.csv")
summary(dataset)

# Replace NA with 0 in every symptom column, then drop the rows that have no
# birth date (AGE cannot be derived for them).
dataset <- dataset %>%
  mutate(
    FEBRE = ifelse(is.na(FEBRE), 0, FEBRE),
    CEFALEIA = ifelse(is.na(CEFALEIA), 0, CEFALEIA),
    MIALGIA = ifelse(is.na(MIALGIA), 0, MIALGIA),
    EXANTEMA = ifelse(is.na(EXANTEMA), 0, EXANTEMA),
    VOMITO = ifelse(is.na(VOMITO), 0, VOMITO),
    NAUSEA = ifelse(is.na(NAUSEA), 0, NAUSEA),
    DOR_COSTAS = ifelse(is.na(DOR_COSTAS), 0, DOR_COSTAS),
    CONJUNTVIT = ifelse(is.na(CONJUNTVIT), 0, CONJUNTVIT),
    ARTRITE = ifelse(is.na(ARTRITE), 0, ARTRITE)
  ) %>%
  filter(!is.na(DT_NASC))

# Approximate age: difference between the reference year (2017) and the birth
# year. Months and days are ignored, so the value can be off by one year.
# The birth dates come from `dataset` itself; the original code referenced an
# undefined object `x` here.
dataset$AGE <- year("2017-07-27") - year(ymd(dataset$DT_NASC))
# Features used for clustering: the nine symptom columns plus the derived age.
grupo <- select(
  dataset,
  FEBRE, CEFALEIA, MIALGIA, EXANTEMA, VOMITO,
  NAUSEA, DOR_COSTAS, CONJUNTVIT, ARTRITE, AGE
)

# For k = 1..7, fit k-means (20 random starts each) and record the share of
# the total sum of squares that remains inside the clusters.
ratio_ss <- vapply(
  seq_len(7),
  function(k) {
    fit <- kmeans(grupo, k, nstart = 20)
    fit$tot.withinss / fit$totss
  },
  numeric(1)
)
plot(ratio_ss, type = "b", xlab = "k")

# Final two-cluster fit, shown on the DOR_COSTAS / VOMITO plane with points
# coloured by cluster membership.
grupo_km <- kmeans(grupo, 2, nstart = 20)
plot(grupo$DOR_COSTAS, grupo$VOMITO, col = grupo_km$cluster)
# Keep only the records where FEBRE, CEFALEIA and MIALGIA are all above zero.
filtrados <- grupo %>%
  filter(FEBRE > 0, CEFALEIA > 0, MIALGIA > 0)

# Within/total sum-of-squares ratio for k = 1..12 (20 random starts per fit).
ratio_ss <- vapply(
  seq_len(12),
  function(k) {
    fit <- kmeans(filtrados, k, nstart = 20)
    fit$tot.withinss / fit$totss
  },
  numeric(1)
)

# Elbow plot of the raw within-groups sum of squares. wssplot() also resets
# the RNG seed, so it must stay ahead of the final kmeans() call below.
wssplot(filtrados, seed = 1000)

# Shows how many clusters would be worth passing to the algorithm.
plot(ratio_ss, type = "b", xlab = "k")

# Six-cluster fit on the filtered records; print it, plot NAUSEA against AGE
# coloured by cluster, and cross-tabulate cluster membership against FEBRE.
filtrados_km <- kmeans(filtrados, 6, nstart = 20)
filtrados_km
plot(filtrados$NAUSEA, filtrados$AGE, col = filtrados_km$cluster)
table(filtrados_km$cluster, filtrados$FEBRE)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment