jalapic · April 6, 2015 20:06
diff --git a/gistfile1.txt b/gistfile1.txt
 ### load packages
 library(babynames) 
 library(dplyr) 
 library(tidyr)
 library(ggplot2)
 library(gridExtra)
 library(magrittr)


 head(babynames)

 tail(babynames)

 g1 <- babynames %>%
  filter(name=="Barbara") %$% 
  ggplot(., aes(year, n)) +
  geom_line(aes(color=sex), lwd=1) +
  scale_color_manual(values = c("firebrick1", "dodgerblue")) +
  theme_bw() +
  ggtitle("Barbara")

 g2 <- babynames %>%
  filter(name=="Megan") %$% 
  ggplot(., aes(year, n)) +
  geom_line(aes(color=sex), lwd=1) +
  scale_color_manual(values = c("firebrick1", "dodgerblue")) +
  theme_bw() +
  ggtitle("Megan")

 g3 <- babynames %>%
  filter(name=="Jennifer") %$% 
  ggplot(., aes(year, n)) +
  geom_line(aes(color=sex), lwd=1) +
  scale_color_manual(values = c("firebrick1", "dodgerblue")) +
  theme_bw() +
  ggtitle("Jennifer")

 g4 <- babynames %>%
  filter(name=="Irene") %$% 
  ggplot(., aes(year, n)) +
  geom_line(aes(color=sex), lwd=1) +
  scale_color_manual(values = c("firebrick1", "dodgerblue")) +
  theme_bw() +
  ggtitle("Irene")

 grid.arrange(g1,g2,g3,g4,ncol=2)


 #popular names
 babynames %>%
  group_by(sex, name) %>%
  summarize(total = sum(n)) %>%
  arrange(desc(total)) %$%
  split(., sex) 

 #total number unique names
 babynames %$%
  split(., sex) %>%
  lapply(. %$% length(unique(name)))


 #reshape data
 babywideF <- 
  babynames %>% 
  filter(sex=="F") %>% 
  select(name, year, n) %>%
  spread(year, n, fill=0)

 rownames(babywideF)<- babywideF %>% .$name  #set rownames
 babywideF %<>% select(-name) # remove name var



 #principal components analysis
 ### principal components analysis - females
 resF.pca <- princomp(babywideF)
 plot(resF.pca)



 ###k-means clustering analysis
 set.seed(100)
 resF.k <- kmeans(babywideF, 6)
 table(resF.k$cluster)

 names(resF.k$cluster[resF.k$cluster==2])
 names(resF.k$cluster[resF.k$cluster==3])
 names(resF.k$cluster[resF.k$cluster==4])
 names(resF.k$cluster[resF.k$cluster==5])
 names(resF.k$cluster[resF.k$cluster==6])

 #cluster 1
 set.seed(10)
 sample(names(resF.k$cluster[resF.k$cluster==1]),10)



 # Repeat the process
 # It might be more beneficial to repeat this process, but only include those names
 # in the top 6 components. To do this we will filter our data by not keeping any names 
 # that appear in cluster 5. This leaves us with 290 names.

 group1x <- names(resF.k$cluster[resF.k$cluster==1])
 babywideF1 <- babywideF %>% mutate(id = rownames(.)) %>% filter(!id %in% group1x) 
 rownames(babywideF1) <- babywideF1$id #using this temp var to re-insert names into rownames (probably not the best way of doing this)
 babywideF1 %<>% select(-id) 

 ### principal components analysis - females
 resF1.pca <- princomp(babywideF1)
 plot(resF1.pca)


 ###k-means clustering analysis
 set.seed(10)
 resF1.k <- kmeans(babywideF1, 7)
 table(resF1.k$cluster)

 names(resF1.k$cluster[resF1.k$cluster==4])


 #Mary
 babynames %>%
  filter(sex=="F") %>%
  filter(name=="Mary") %$% 
  ggplot(., aes(year, n)) +
  geom_line(lwd=1, color="red") +
  theme_bw()


 #group 3
 group3 <- names(resF1.k$cluster[resF1.k$cluster==3])

 gg3 <- 
  babynames %>%
  filter(sex=="F") %>%
  filter(name %in% group3) %$% 
  ggplot(., aes(year, n)) +
  geom_line(aes(color=name, group=name), lwd=1) +
  theme_bw()

 gg3


 #group 6
 group6 <- names(resF1.k$cluster[resF1.k$cluster==6])

 gg6 <- 
  babynames %>%
  filter(sex=="F") %>%
  filter(name %in% group6) %$% 
  ggplot(., aes(year, n)) +
  geom_line(aes(color=name, group=name), lwd=1) +
  theme_bw()

 gg6


 # Elizabeth
 babynames %>%
  filter(sex=="F") %>%
  filter(name=="Elizabeth") %$% 
  ggplot(., aes(year, n)) +
  geom_line(lwd=1, color="red") +
  theme_bw()

 #group1
 group1 <- names(resF1.k$cluster[resF1.k$cluster==1])

 gg1 <- 
  babynames %>%
  filter(sex=="F") %>%
  filter(name %in% group1) %$% 
  ggplot(., aes(year, n)) +
  geom_line(aes(color=name, group=name), lwd=1) +
  theme_bw()

 gg1


 #group2
 group2 <- names(resF1.k$cluster[resF1.k$cluster==2])

 gg2 <- 
  babynames %>%
  filter(sex=="F") %>%
  filter(name %in% group2) %$% 
  ggplot(., aes(year, n)) +
  geom_line(aes(color=name, group=name), lwd=1) +
  theme_bw()

 gg2



 # Group 5 is the second biggest group and it has 66 names.

 group5 <- names(resF1.k$cluster[resF1.k$cluster==5])

 group5 #these are these names


 gg5 <- 
  babynames %>%
  filter(sex=="F") %>%
  filter(name %in% group5) %$% 
  ggplot(., aes(year, n)) +
  geom_line(aes(color=name, group=name), lwd=.5) +
  theme_bw() +
  theme(legend.position = "none")


 gg5



 #double boomers
 babynames %>%
  filter(sex=="F") %>%
  filter(name=="Grace" | name=="Julia" | name=="Ella") %$% 
  ggplot(., aes(year, n)) +
  geom_line(aes(color=name), lwd=1) +
  theme_bw()


 #group7
 group7 <- names(resF1.k$cluster[resF1.k$cluster==7])
 group7 # these are these names


 gg7 <- 
  babynames %>%
  filter(sex=="F") %>%
  filter(name %in% group7) %$% 
  ggplot(., aes(year, n)) +
  geom_line(aes(color=name, group=name), lwd=.5) +
  theme_bw() +
  theme(legend.position = "none")

 gg7


 ## overview

 grid.arrange(gg2, gg1, gg3, gg6, ncol=2)


 ####  T-distributed Stochastic Neighbor Embedding. 

 library(tsne)
 D <- dist(babywideF1)  #create distance object


 # creating dataframe for plotting colors and text on final plot
 namesdf <- data.frame(name = c(group1, group2, group3, "Mary", group5, group6, group7), 
                      group = c(rep(1, length(group1)), rep(2, length(group2)), rep(3, length(group3)), rep(4, 1),
                                rep(5, length(group5)), rep(6, length(group6)), rep(7, length(group7)))
 )

 namesdf %<>% arrange(name) #names in correct order to match rownames of babywideF1 

 colors = rainbow(7)
 names(colors) = unique(namesdf$group)

 #define function used in plotting
 ecb = function(x,y){ plot(x,t='n'); text(x,labels=rownames(babywideF1), col=colors[namesdf$group], cex=1) }

 #plot
 tsne_D = tsne(D, k=2,  epoch_callback = ecb, perplexity=50)
	### load packages
	library(babynames)
	library(dplyr)
	library(tidyr)
	library(ggplot2)
	library(gridExtra)
	library(magrittr)


	head(babynames)

	tail(babynames)

	g1 <- babynames %>%
	filter(name=="Barbara") %$%
	ggplot(., aes(year, n)) +
	geom_line(aes(color=sex), lwd=1) +
	scale_color_manual(values = c("firebrick1", "dodgerblue")) +
	theme_bw() +
	ggtitle("Barbara")

	g2 <- babynames %>%
	filter(name=="Megan") %$%
	ggplot(., aes(year, n)) +
	geom_line(aes(color=sex), lwd=1) +
	scale_color_manual(values = c("firebrick1", "dodgerblue")) +
	theme_bw() +
	ggtitle("Megan")

	g3 <- babynames %>%
	filter(name=="Jennifer") %$%
	ggplot(., aes(year, n)) +
	geom_line(aes(color=sex), lwd=1) +
	scale_color_manual(values = c("firebrick1", "dodgerblue")) +
	theme_bw() +
	ggtitle("Jennifer")

	g4 <- babynames %>%
	filter(name=="Irene") %$%
	ggplot(., aes(year, n)) +
	geom_line(aes(color=sex), lwd=1) +
	scale_color_manual(values = c("firebrick1", "dodgerblue")) +
	theme_bw() +
	ggtitle("Irene")

	grid.arrange(g1,g2,g3,g4,ncol=2)


	#popular names
	babynames %>%
	group_by(sex, name) %>%
	summarize(total = sum(n)) %>%
	arrange(desc(total)) %$%
	split(., sex)

	#total number unique names
	babynames %$%
	split(., sex) %>%
	lapply(. %$% length(unique(name)))


	#reshape data
	babywideF <-
	babynames %>%
	filter(sex=="F") %>%
	select(name, year, n) %>%
	spread(year, n, fill=0)

	rownames(babywideF)<- babywideF %>% .$name #set rownames
	babywideF %<>% select(-name) # remove name var



	#principal components analysis
	### principal components analysis - females
	resF.pca <- princomp(babywideF)
	plot(resF.pca)



	###k-means clustering analysis
	set.seed(100)
	resF.k <- kmeans(babywideF, 6)
	table(resF.k$cluster)

	names(resF.k$cluster[resF.k$cluster==2])
	names(resF.k$cluster[resF.k$cluster==3])
	names(resF.k$cluster[resF.k$cluster==4])
	names(resF.k$cluster[resF.k$cluster==5])
	names(resF.k$cluster[resF.k$cluster==6])

	#cluster 1
	set.seed(10)
	sample(names(resF.k$cluster[resF.k$cluster==1]),10)



	# Repeat the process
	# It might be more beneficial to repeat this process, but only include those names
	# in the top 6 components. To do this we will filter our data by not keeping any names
	# that appear in cluster 5. This leaves us with 290 names.

	group1x <- names(resF.k$cluster[resF.k$cluster==1])
	babywideF1 <- babywideF %>% mutate(id = rownames(.)) %>% filter(!id %in% group1x)
	rownames(babywideF1) <- babywideF1$id #using this temp var to re-insert names into rownames (probably not the best way of doing this)
	babywideF1 %<>% select(-id)

	### principal components analysis - females
	resF1.pca <- princomp(babywideF1)
	plot(resF1.pca)


	###k-means clustering analysis
	set.seed(10)
	resF1.k <- kmeans(babywideF1, 7)
	table(resF1.k$cluster)

	names(resF1.k$cluster[resF1.k$cluster==4])


	#Mary
	babynames %>%
	filter(sex=="F") %>%
	filter(name=="Mary") %$%
	ggplot(., aes(year, n)) +
	geom_line(lwd=1, color="red") +
	theme_bw()


	#group 3
	group3 <- names(resF1.k$cluster[resF1.k$cluster==3])

	gg3 <-
	babynames %>%
	filter(sex=="F") %>%
	filter(name %in% group3) %$%
	ggplot(., aes(year, n)) +
	geom_line(aes(color=name, group=name), lwd=1) +
	theme_bw()

	gg3


	#group 6
	group6 <- names(resF1.k$cluster[resF1.k$cluster==6])

	gg6 <-
	babynames %>%
	filter(sex=="F") %>%
	filter(name %in% group6) %$%
	ggplot(., aes(year, n)) +
	geom_line(aes(color=name, group=name), lwd=1) +
	theme_bw()

	gg6


	# Elizabeth
	babynames %>%
	filter(sex=="F") %>%
	filter(name=="Elizabeth") %$%
	ggplot(., aes(year, n)) +
	geom_line(lwd=1, color="red") +
	theme_bw()

	#group1
	group1 <- names(resF1.k$cluster[resF1.k$cluster==1])

	gg1 <-
	babynames %>%
	filter(sex=="F") %>%
	filter(name %in% group1) %$%
	ggplot(., aes(year, n)) +
	geom_line(aes(color=name, group=name), lwd=1) +
	theme_bw()

	gg1


	#group2
	group2 <- names(resF1.k$cluster[resF1.k$cluster==2])

	gg2 <-
	babynames %>%
	filter(sex=="F") %>%
	filter(name %in% group2) %$%
	ggplot(., aes(year, n)) +
	geom_line(aes(color=name, group=name), lwd=1) +
	theme_bw()

	gg2



	# Group 5 is the second biggest group and it has 66 names.

	group5 <- names(resF1.k$cluster[resF1.k$cluster==5])

	group5 #these are these names


	gg5 <-
	babynames %>%
	filter(sex=="F") %>%
	filter(name %in% group5) %$%
	ggplot(., aes(year, n)) +
	geom_line(aes(color=name, group=name), lwd=.5) +
	theme_bw() +
	theme(legend.position = "none")


	gg5



	#double boomers
	babynames %>%
	filter(sex=="F") %>%
	filter(name=="Grace" \| name=="Julia" \| name=="Ella") %$%
	ggplot(., aes(year, n)) +
	geom_line(aes(color=name), lwd=1) +
	theme_bw()


	#group7
	group7 <- names(resF1.k$cluster[resF1.k$cluster==7])
	group7 # these are these names


	gg7 <-
	babynames %>%
	filter(sex=="F") %>%
	filter(name %in% group7) %$%
	ggplot(., aes(year, n)) +
	geom_line(aes(color=name, group=name), lwd=.5) +
	theme_bw() +
	theme(legend.position = "none")

	gg7


	## overview

	grid.arrange(gg2, gg1, gg3, gg6, ncol=2)


	#### T-distributed Stochastic Neighbor Embedding.

	library(tsne)
	D <- dist(babywideF1) #create distance object


	# creating dataframe for plotting colors and text on final plot
	namesdf <- data.frame(name = c(group1, group2, group3, "Mary", group5, group6, group7),
	group = c(rep(1, length(group1)), rep(2, length(group2)), rep(3, length(group3)), rep(4, 1),
	rep(5, length(group5)), rep(6, length(group6)), rep(7, length(group7)))
	)

	namesdf %<>% arrange(name) #names in correct order to match rownames of babywideF1

	colors = rainbow(7)
	names(colors) = unique(namesdf$group)

	#define function used in plotting
	ecb = function(x,y){ plot(x,t='n'); text(x,labels=rownames(babywideF1), col=colors[namesdf$group], cex=1) }

	#plot
	tsne_D = tsne(D, k=2, epoch_callback = ecb, perplexity=50)