nkrabben · May 1, 2017 21:30
diff --git a/ndsr_skill_cluster.r b/ndsr_skill_cluster.r
 library(readr)
 library(dplyr)
 library(tidyr)
 library(factoextra)

 # Read the survey responses from OSF and keep only the first 30 columns;
 # the last 3 columns don't ask about skills.
 osf_url <- "https://files.osf.io/v1/resources/zndwq/providers/osfstorage/570be0eab83f6901d62b19d9"
 responses <- read_csv(osf_url)[, 1:30]

 # Reshape raw data into number of responses per importance level:
 # one row per skill, one column per distinct response value.
 skills <- responses %>%
   # Fix typo in the source column name
   rename(NeedsAssessGapAnalyses = NeedsAssessGayAnalyses) %>%
   # Long format: every answer becomes one (skill, value) row
   gather(skill) %>%
   # Count how often each value was given for each skill
   group_by(skill, value) %>%
   summarize(n = n()) %>%
   ungroup() %>%
   # Wide format; skill/value combinations that never occur count as 0
   spread(value, n, fill = 0) %>%
   # Move back to data.frame since tibbles can't have row labels
   as.data.frame()

 # Add skill names as row labels and drop the skill column.
 # drop = FALSE keeps `skills` a data.frame even if a single column remains.
 rownames(skills) <- skills$skill
 skills <- skills[, -1, drop = FALSE]

 # Run k-means cluster analysis (3 clusters, 30 random starts) with a
 # repeatable seed. graph = FALSE keeps eclust() from drawing its own plot;
 # FALSE is spelled out because the alias `F` can be reassigned.
 set.seed(42)
 skill_clusters <- eclust(skills, "kmeans", k = 3, nstart = 30, graph = FALSE)

 # Visualize k-means clusters.
 # Add meaning to cluster labels: the same legend title, breaks and labels
 # are applied to the shape, fill and colour scales so ggplot2 can merge them
 # into a single legend. Wish there was a convenience function to do this.
 cluster_breaks <- c(1, 2, 3)
 cluster_labels <- c("Required Skills", "Niche Skills", "Nice-to-Have Skills")

 skill_plot <- fviz_cluster(
   skill_clusters,
   ellipse.type = "norm",
   repel = TRUE
 ) +
   labs(x = "55% of variance in necessity", y = "25% of variance in necessity",
        title = "NDSR Project Skills Grouped by Variance of Importance") +
   scale_shape_discrete(name = "Skill Need",
                        breaks = cluster_breaks,
                        labels = cluster_labels) +
   scale_fill_discrete(name = "Skill Need",
                       breaks = cluster_breaks,
                       labels = cluster_labels) +
   scale_color_discrete(name = "Skill Need",
                        breaks = cluster_breaks,
                        labels = cluster_labels)

 # Save the plot explicitly. ggsave() writes a file and is not a plot layer,
 # so it must not be chained with `+` (newer ggplot2 releases raise an error
 # for `plot + ggsave()`); passing `plot =` also avoids relying on last_plot().
 ggsave("ndsr_skill_cluster.png", plot = skill_plot)

 # Display the plot, as the original top-level expression did
 skill_plot
	library(readr)
	library(dplyr)
	library(tidyr)
	library(factoextra)

	# Read the survey responses from OSF and keep only the first 30 columns;
	# the last 3 columns don't ask about skills.
	osf_url <- "https://files.osf.io/v1/resources/zndwq/providers/osfstorage/570be0eab83f6901d62b19d9"
	responses <- read_csv(osf_url)[, 1:30]

	# Reshape raw data into number of responses per importance level:
	# one row per skill, one column per distinct response value.
	skills <- responses %>%
	  # Fix typo in the source column name
	  rename(NeedsAssessGapAnalyses = NeedsAssessGayAnalyses) %>%
	  # Long format: every answer becomes one (skill, value) row
	  gather(skill) %>%
	  # Count how often each value was given for each skill
	  group_by(skill, value) %>%
	  summarize(n = n()) %>%
	  ungroup() %>%
	  # Wide format; skill/value combinations that never occur count as 0
	  spread(value, n, fill = 0) %>%
	  # Move back to data.frame since tibbles can't have row labels
	  as.data.frame()

	# Add skill names as row labels and drop the skill column.
	# drop = FALSE keeps `skills` a data.frame even if a single column remains.
	rownames(skills) <- skills$skill
	skills <- skills[, -1, drop = FALSE]

	# Run k-means cluster analysis (3 clusters, 30 random starts) with a
	# repeatable seed. graph = FALSE keeps eclust() from drawing its own plot;
	# FALSE is spelled out because the alias `F` can be reassigned.
	set.seed(42)
	skill_clusters <- eclust(skills, "kmeans", k = 3, nstart = 30, graph = FALSE)

	# Visualize k-means clusters.
	# Add meaning to cluster labels: the same legend title, breaks and labels
	# are applied to the shape, fill and colour scales so ggplot2 can merge them
	# into a single legend. Wish there was a convenience function to do this.
	cluster_breaks <- c(1, 2, 3)
	cluster_labels <- c("Required Skills", "Niche Skills", "Nice-to-Have Skills")

	skill_plot <- fviz_cluster(
	  skill_clusters,
	  ellipse.type = "norm",
	  repel = TRUE
	) +
	  labs(x = "55% of variance in necessity", y = "25% of variance in necessity",
	       title = "NDSR Project Skills Grouped by Variance of Importance") +
	  scale_shape_discrete(name = "Skill Need",
	                       breaks = cluster_breaks,
	                       labels = cluster_labels) +
	  scale_fill_discrete(name = "Skill Need",
	                      breaks = cluster_breaks,
	                      labels = cluster_labels) +
	  scale_color_discrete(name = "Skill Need",
	                       breaks = cluster_breaks,
	                       labels = cluster_labels)

	# Save the plot explicitly. ggsave() writes a file and is not a plot layer,
	# so it must not be chained with `+` (newer ggplot2 releases raise an error
	# for `plot + ggsave()`); passing `plot =` also avoids relying on last_plot().
	ggsave("ndsr_skill_cluster.png", plot = skill_plot)

	# Display the plot, as the original top-level expression did
	skill_plot