Last active
September 3, 2020 21:23
-
-
Save jmclawson/79cd732fb9e6a4f1dd14f1caaac4ee2d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(stylo) | |
library(ggplot2) | |
library(dendextend) | |
# Load with this line: devtools::source_gist("79cd732fb9e6a4f1dd14f1caaac4ee2d") | |
# Use df <- stylo() to save frequency results | |
# Then use stylo2gg(df) to visualize principal components | |
# Use stylo2gg(df, viz="hc") to show hierarchical clusters without rerunning stylo | |
stylo2gg <- function(df, | |
viz = "pca", | |
num.features = NULL, | |
labeling = NULL, | |
classing = NULL, | |
scaling = FALSE, | |
linkage = "ward.D", | |
distance = "delta", | |
horiz = TRUE, | |
invert.x = FALSE, | |
invert.y = FALSE) { | |
if (is.null(num.features)) { | |
num.features <- length(df$features.actually.used) | |
} | |
if (viz == "PCR") { | |
viz <- "pca" | |
scaling <- TRUE | |
} | |
df <- df$table.with.all.freqs %>% | |
.[,df$features.actually.used[1:num.features]] %>% | |
as.data.frame() | |
# df_summary <- rbind(avg = colMeans(df), | |
# sd = apply(df, 2, sd)) | |
df_means <- colMeans(df) | |
df_sd <- apply(df, 2, sd) | |
## create table of z scores | |
corpus_zscores <- list() | |
for (row_i in rownames(df)) { | |
thisrow <- (df[row_i, ] - df_means) / df_sd | |
corpus_zscores[[row_i]] <- thisrow | |
} | |
df_z <- data.frame(matrix(unlist(corpus_zscores), | |
nrow = length(corpus_zscores), | |
byrow = T)) | |
rownames(df_z) <- names(corpus_zscores) | |
colnames(df_z) <- colnames(corpus_zscores[[1]]) | |
if (viz == "PCV") { | |
viz <- "pca" | |
df_z <- df | |
} | |
if (viz == "pca" || viz == "PCA" || viz == "PCR") { | |
df_pca <- prcomp(df_z, scale. = scaling) | |
pc_variance <- summary(df_pca)$importance[2,1:2] | |
df_pca <- df_pca$x %>% | |
as.data.frame() | |
if (invert.x) { | |
df_pca$PC1 <- df_pca$PC1 * -1 | |
} | |
if (invert.y) { | |
df_pca$PC2 <- df_pca$PC2 * -1 | |
} | |
if (is.null(classing)) { | |
df_pca$class <- df_pca %>% | |
rownames() %>% | |
strsplit("_") %>% | |
sapply(`[`, 1) | |
} else { | |
df_pca$class <- classing | |
} | |
df_pca$title <- df_pca %>% | |
rownames() %>% | |
strsplit("_") %>% | |
sapply(`[`, 2) | |
df_pca$shorttitle <- df_pca$title %>% | |
gsub(pattern = "[a-z]", | |
replacement = "", | |
x = .) | |
num_shapes <- df_pca$class %>% | |
unique() %>% | |
length() | |
the_plot <- df_pca %>% | |
ggplot(aes(PC1, | |
PC2)) + | |
geom_hline(yintercept = 0, color = "gray") + | |
geom_vline(xintercept = 0, color = "gray") | |
if (is.null(labeling)) { | |
the_plot <- the_plot + | |
geom_point(aes(shape = class, | |
color = class)) + | |
scale_shape_manual(values = rep(c(1, 3:11), length.out = num_shapes)) | |
} else { | |
the_plot <- the_plot + | |
geom_text(aes(label = labeling, | |
color = class), show.legend = FALSE) | |
} | |
the_plot <- the_plot + | |
theme_bw() + | |
theme(legend.title = element_blank()) + | |
labs(x = paste0("PC1 (", | |
round(pc_variance[1]*100,1), | |
"%)"), | |
y = paste0("PC2 (", | |
round(pc_variance[2]*100,1), | |
"%)")) | |
} else if (viz == "hc" || viz == "ca" || viz == "CA" || viz == "HC") { | |
if (!distance == "euclidean") { | |
df <- df_z | |
} | |
if (is.null(classing)){ | |
the_class <- df %>% | |
rownames() %>% | |
strsplit("_") %>% | |
sapply(`[`, 1) | |
} else { | |
the_class <- classing | |
} | |
gg_color <- function(n) { | |
hues = seq(15, 375, length = n + 1) | |
hcl(h = hues, l = 65, c = 100)[1:n] | |
} | |
the_colors <- gg_color(length(unique(the_class))) | |
if (!is.null(labeling)) { | |
rownames(df) <- labeling | |
} | |
rownames(df) <- paste0(" ", rownames(df)) | |
the_dend <- df %>% | |
as.matrix() | |
if (distance == "argamon") { | |
the_dend <- the_dend %>% | |
dist.argamon() | |
} else if (distance == "eder") { | |
the_dend <- the_dend %>% | |
dist.eder() | |
} else if (distance == "cosine") { | |
the_dend <- the_dend %>% | |
dist.cosine() | |
} else if (distance == "simple") { | |
the_dend <- the_dend %>% | |
dist.simple() | |
} else if (distance == "delta") {# default to delta | |
the_dend <- the_dend %>% | |
dist.delta() | |
} else { | |
the_dend <- the_dend %>% | |
dist(method = distance) | |
} | |
the_dend <- the_dend %>% | |
as.dist() %>% | |
hclust(method = linkage) %>% | |
as.dendrogram() %>% | |
set("branches_lwd", 0.7) %>% | |
# set("branches_k_color", k = 3) %>% | |
set("labels_cex", 0.7) %>% set("hang_leaves", 0) | |
labels_colors(the_dend) <- the_class %>% | |
as.factor() %>% | |
.[order.dendrogram(the_dend)] %>% | |
the_colors[.] | |
the_plot <- the_dend %>% ggplot(horiz = horiz) | |
} | |
return(the_plot) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Instead of this gist, please use the the stylo2gg package, which does more and does it better.