Created
April 6, 2015 20:06
-
-
Save jalapic/2b9d7e4b9e8b7119d965 to your computer and use it in GitHub Desktop.
babynames MDS
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### load packages | |
library(babynames) | |
library(dplyr) | |
library(tidyr) | |
library(ggplot2) | |
library(gridExtra) | |
library(magrittr) | |
head(babynames) | |
tail(babynames) | |
g1 <- babynames %>% | |
filter(name=="Barbara") %$% | |
ggplot(., aes(year, n)) + | |
geom_line(aes(color=sex), lwd=1) + | |
scale_color_manual(values = c("firebrick1", "dodgerblue")) + | |
theme_bw() + | |
ggtitle("Barbara") | |
g2 <- babynames %>% | |
filter(name=="Megan") %$% | |
ggplot(., aes(year, n)) + | |
geom_line(aes(color=sex), lwd=1) + | |
scale_color_manual(values = c("firebrick1", "dodgerblue")) + | |
theme_bw() + | |
ggtitle("Megan") | |
g3 <- babynames %>% | |
filter(name=="Jennifer") %$% | |
ggplot(., aes(year, n)) + | |
geom_line(aes(color=sex), lwd=1) + | |
scale_color_manual(values = c("firebrick1", "dodgerblue")) + | |
theme_bw() + | |
ggtitle("Jennifer") | |
g4 <- babynames %>% | |
filter(name=="Irene") %$% | |
ggplot(., aes(year, n)) + | |
geom_line(aes(color=sex), lwd=1) + | |
scale_color_manual(values = c("firebrick1", "dodgerblue")) + | |
theme_bw() + | |
ggtitle("Irene") | |
grid.arrange(g1,g2,g3,g4,ncol=2) | |
#popular names | |
babynames %>% | |
group_by(sex, name) %>% | |
summarize(total = sum(n)) %>% | |
arrange(desc(total)) %$% | |
split(., sex) | |
#total number unique names | |
babynames %$% | |
split(., sex) %>% | |
lapply(. %$% length(unique(name))) | |
#reshape data | |
babywideF <- | |
babynames %>% | |
filter(sex=="F") %>% | |
select(name, year, n) %>% | |
spread(year, n, fill=0) | |
rownames(babywideF)<- babywideF %>% .$name #set rownames | |
babywideF %<>% select(-name) # remove name var | |
#principal components analysis | |
### principal components analysis - females | |
resF.pca <- princomp(babywideF) | |
plot(resF.pca) | |
###k-means clustering analysis | |
set.seed(100) | |
resF.k <- kmeans(babywideF, 6) | |
table(resF.k$cluster) | |
names(resF.k$cluster[resF.k$cluster==2]) | |
names(resF.k$cluster[resF.k$cluster==3]) | |
names(resF.k$cluster[resF.k$cluster==4]) | |
names(resF.k$cluster[resF.k$cluster==5]) | |
names(resF.k$cluster[resF.k$cluster==6]) | |
#cluster 1 | |
set.seed(10) | |
sample(names(resF.k$cluster[resF.k$cluster==1]),10) | |
# Repeat the process | |
# It might be more beneficial to repeat this process, but only include those names | |
# in the top 6 components. To do this we will filter our data by not keeping any names | |
# that appear in cluster 5. This leaves us with 290 names. | |
group1x <- names(resF.k$cluster[resF.k$cluster==1]) | |
babywideF1 <- babywideF %>% mutate(id = rownames(.)) %>% filter(!id %in% group1x) | |
rownames(babywideF1) <- babywideF1$id #using this temp var to re-insert names into rownames (probably not the best way of doing this) | |
babywideF1 %<>% select(-id) | |
### principal components analysis - females | |
resF1.pca <- princomp(babywideF1) | |
plot(resF1.pca) | |
###k-means clustering analysis | |
set.seed(10) | |
resF1.k <- kmeans(babywideF1, 7) | |
table(resF1.k$cluster) | |
names(resF1.k$cluster[resF1.k$cluster==4]) | |
#Mary | |
babynames %>% | |
filter(sex=="F") %>% | |
filter(name=="Mary") %$% | |
ggplot(., aes(year, n)) + | |
geom_line(lwd=1, color="red") + | |
theme_bw() | |
#group 3 | |
group3 <- names(resF1.k$cluster[resF1.k$cluster==3]) | |
gg3 <- | |
babynames %>% | |
filter(sex=="F") %>% | |
filter(name %in% group3) %$% | |
ggplot(., aes(year, n)) + | |
geom_line(aes(color=name, group=name), lwd=1) + | |
theme_bw() | |
gg3 | |
#group 6 | |
group6 <- names(resF1.k$cluster[resF1.k$cluster==6]) | |
gg6 <- | |
babynames %>% | |
filter(sex=="F") %>% | |
filter(name %in% group6) %$% | |
ggplot(., aes(year, n)) + | |
geom_line(aes(color=name, group=name), lwd=1) + | |
theme_bw() | |
gg6 | |
# Elizabeth | |
babynames %>% | |
filter(sex=="F") %>% | |
filter(name=="Elizabeth") %$% | |
ggplot(., aes(year, n)) + | |
geom_line(lwd=1, color="red") + | |
theme_bw() | |
#group1 | |
group1 <- names(resF1.k$cluster[resF1.k$cluster==1]) | |
gg1 <- | |
babynames %>% | |
filter(sex=="F") %>% | |
filter(name %in% group1) %$% | |
ggplot(., aes(year, n)) + | |
geom_line(aes(color=name, group=name), lwd=1) + | |
theme_bw() | |
gg1 | |
#group2 | |
group2 <- names(resF1.k$cluster[resF1.k$cluster==2]) | |
gg2 <- | |
babynames %>% | |
filter(sex=="F") %>% | |
filter(name %in% group2) %$% | |
ggplot(., aes(year, n)) + | |
geom_line(aes(color=name, group=name), lwd=1) + | |
theme_bw() | |
gg2 | |
# Group 5 is the second biggest group and it has 66 names. | |
group5 <- names(resF1.k$cluster[resF1.k$cluster==5]) | |
group5 #these are these names | |
gg5 <- | |
babynames %>% | |
filter(sex=="F") %>% | |
filter(name %in% group5) %$% | |
ggplot(., aes(year, n)) + | |
geom_line(aes(color=name, group=name), lwd=.5) + | |
theme_bw() + | |
theme(legend.position = "none") | |
gg5 | |
#double boomers | |
babynames %>% | |
filter(sex=="F") %>% | |
filter(name=="Grace" | name=="Julia" | name=="Ella") %$% | |
ggplot(., aes(year, n)) + | |
geom_line(aes(color=name), lwd=1) + | |
theme_bw() | |
#group7 | |
group7 <- names(resF1.k$cluster[resF1.k$cluster==7]) | |
group7 # these are these names | |
gg7 <- | |
babynames %>% | |
filter(sex=="F") %>% | |
filter(name %in% group7) %$% | |
ggplot(., aes(year, n)) + | |
geom_line(aes(color=name, group=name), lwd=.5) + | |
theme_bw() + | |
theme(legend.position = "none") | |
gg7 | |
## overview | |
grid.arrange(gg2, gg1, gg3, gg6, ncol=2) | |
#### T-distributed Stochastic Neighbor Embedding. | |
library(tsne) | |
D <- dist(babywideF1) #create distance object | |
# creating dataframe for plotting colors and text on final plot | |
namesdf <- data.frame(name = c(group1, group2, group3, "Mary", group5, group6, group7), | |
group = c(rep(1, length(group1)), rep(2, length(group2)), rep(3, length(group3)), rep(4, 1), | |
rep(5, length(group5)), rep(6, length(group6)), rep(7, length(group7))) | |
) | |
namesdf %<>% arrange(name) #names in correct order to match rownames of babywideF1 | |
colors = rainbow(7) | |
names(colors) = unique(namesdf$group) | |
#define function used in plotting | |
ecb = function(x,y){ plot(x,t='n'); text(x,labels=rownames(babywideF1), col=colors[namesdf$group], cex=1) } | |
#plot | |
tsne_D = tsne(D, k=2, epoch_callback = ecb, perplexity=50) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment