Last active
August 4, 2018 17:01
-
-
Save benmarwick/3d160da203e4abaacd670d895f1ca72a to your computer and use it in GitHub Desktop.
Cluster images traced by hand, using EFA and PCA, and plot
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# to make traces... | |
# Import an EPS file in Inkscape | |
# Trace over it to show half of vessel profile, from top of rim to centre of base, about 20-30 segements | |
# detelete EPS from from Inkscape | |
# export PNG of trace from Inkscape | |
library(magick) | |
lf <- list.files("../traces", full.names = TRUE) | |
for (i in seq_along(jpgs)) { | |
image_write(image_convert(image_read(lf[[i]]), "jpg"), paste0(lf[[i]], "_output.jpg")) | |
} | |
jpgs<- list.files("../traces", pattern = ".jpg", full.names = TRUE) | |
library(Momocs) | |
jpgs_imported <- import_jpg(jpgs, threshold = 0.1) | |
jpgs_imported_coo <- | |
jpgs_imported[-2] %>% | |
Opn() %>% | |
Out() %>% | |
coo_close() %>% | |
coo_center %>% | |
coo_scale %>% | |
coo_align() %>% | |
coo_rotate(theta = pi/2) %>% | |
coo_slidedirection("up") | |
jpgs_imported_coo %>% | |
panel() | |
# efa | |
op <- efourier(jpgs_imported_coo, norm= FALSE) | |
op.p <- PCA(op) | |
plot(op.p, | |
cex = 3, | |
labelspoints = TRUE, | |
cex.labelspoints = 1) | |
PCcontrib(op.p) | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# get SVG files from google drive | |
library(googledrive) | |
ID <- "https://drive.google.com/drive/u/1/folders/1wNLi_v40KX-U6s42eHrwueiearLvZ5Jv" | |
g_drive <- drive_ls( path = "Pot_outline_trace", recursive = TRUE, pattern = ".svg$") | |
# download SVGs from google | |
library(tidyverse) | |
map2(g_drive$id, | |
g_drive$name, | |
~as_id(.x) %>% drive_download(path = str_glue('Pottery_outlines/outlines_svg/{.y}'))) | |
lf <- list.files("Pottery_outlines/outlines_svg", | |
full.names = TRUE) | |
library(xml2) | |
remove_scan_and_fill <- function(svg_file) { | |
# remove the scan image from the SVG | |
t1 <- read_xml(svg_file) | |
t2 <- xml_children(t1) | |
scan_layer <- t2[[4]] | |
xml_remove(scan_layer) | |
# now we have a SVG without the scan layer | |
# change from outline to fill | |
stroke_and_fill_node <- xml_children(t1)[4] | |
stroke_and_fill_node_inner <- xml_children(stroke_and_fill_node) | |
xml_attr(stroke_and_fill_node_inner, "style") <- | |
"fill:#000000;stroke:none;stroke-width:5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1" | |
write_xml(t1, | |
str_glue("Pottery_outlines/outlines_svg/fill_{basename(svg_file)}")) | |
# fit image object to canvas in svg so it's all captured during the conversion to JPG | |
system(str_glue('inkscape --verb=EditSelectAll --verb=AlignHorizontalCenter --verb=AlignVerticalCenter --verb=FitCanvasToSelectionOrDrawing --verb=FileSave --verb=FileQuit "$(pwd)/Pottery_outlines/outlines_svg/fill_{basename(svg_file)}"')) | |
} | |
# run the function on all SVGs | |
walk(lf, remove_scan_and_fill) | |
# convert SVG to PNG | |
filled_svgs <- list.files("Pottery_outlines/outlines_svg", pattern = "^fill_", full.names = TRUE) | |
# convert svgs to to jpg | |
library(magick) | |
map(filled_svgs, | |
~image_read_svg(.x) %>% | |
image_background("white") %>% | |
image_convert("jpg") %>% | |
image_write(str_glue("{.x}_output.jpg"))) | |
# from https://gist.github.com/benmarwick/3d160da203e4abaacd670d895f1ca72a | |
#-------------------------------------------------- | |
# Once we have the jpgs, start from here | |
library(tidyverse) | |
# get list of jpgs only | |
jpgs <- list.files("Pottery_outlines/outlines_svg", pattern = ".jpg", full.names = TRUE) | |
# join file names to phase | |
pottery_data <- readr::read_csv("Pottery_outlines/KWL_Ceramic_final2018_with_phases.csv") | |
pottery_data <- pottery_data %>% | |
mutate(filename = str_glue('fill_{filename}.svg_output.jpg')) | |
# some outlines that we have no phases for... | |
sdiff <- setdiff(pottery_data$filename, basename(jpgs)) | |
# the ones we do have both outlines and phases | |
isect <- intersect(pottery_data$filename, basename(jpgs)) | |
jpgs <- jpgs[basename(jpgs) %in% isect] | |
pottery_data <- | |
pottery_data %>% | |
filter(filename %in% isect) %>% | |
distinct(filename, phase, .keep_all = TRUE) | |
phases_isect <- | |
pottery_data %>% | |
select(filename, phase) %>% | |
filter(filename %in% isect) %>% | |
mutate_all(as.factor) %>% | |
distinct(filename, phase) %>% | |
mutate(phase = as.factor(case_when( | |
phase == 'post-e' ~ "Post-European", | |
phase == 'pre-e' ~ "Pre-European", | |
phase == 'ch-con' ~ "Chinese Contact", | |
))) | |
# how many pots in each phase? | |
phases_isect %>% | |
group_by(phase) %>% | |
tally(sort = T) | |
# start shape exploration | |
library(Momocs) | |
# it freezes on the empty ones... | |
jpgs_imported <- import_jpg(jpgs, threshold = 0.5) | |
# Normalise the shapes, from | |
# https://www.repository.cam.ac.uk/bitstream/handle/1810/266423/Supplementary%20Code%20S1.pdf?sequence=20&isAllowed=y | |
jpgs_imported_coo <- | |
jpgs_imported %>% | |
Out(., fac = phases_isect) %>% | |
# Centring, scaling and sampling pseudo-landmarks | |
coo_center %>% | |
coo_scale %>% | |
coo_sample(., 1000) %>% | |
# Procrustes superimposition | |
fgProcrustes %>% | |
# Normalisation of the starting points | |
coo_slidedirection | |
jpgs_imported_coo %>% | |
panel(., fac = phases_isect$phase) | |
pile(jpgs_imported_coo) | |
# check how to do outlines: https://vbonhomme.github.io/Momocs/vignettes/Momocs_speed_dating.html | |
#----------------------------------------------------------- | |
# efa | |
# this is quite slow... | |
op <- efourier(jpgs_imported_coo, norm = FALSE) | |
# The progressive capture of shape geometry along the number of harmonics can be visualized with: | |
calibrate_reconstructions_efourier(jpgs_imported_coo, | |
id = 1, | |
range = c(1, 2, 4, 8, 14, 20)) | |
# mean shape, per group | |
bot.ms <- mshapes(op, 2) | |
`Chinese Contact` <- bot.ms$shp$`Chinese Contact` %T>% coo_plot(border="blue") | |
`Post-European` <- bot.ms$shp$`Post-European` %T>% coo_draw(border="red") | |
`Pre-European` <- bot.ms$shp$`Pre-European` %T>% coo_draw(border="green") | |
legend("topright", | |
lwd=1, | |
col=c("blue", "red", "green"), | |
legend=c("Chinese contact", "Post-European", "Pre-European")) | |
# Thin plate spline analysis (TPS) | |
# Thin plate splines (TPS) was used to compare average shapes and visualise the outline deformations required to pass from an extreme of the morphospace to the other. | |
# Deformation grids | |
png("Pottery_outlines/tps-pre-Euro-to-post-Euro.png", res = 300, w = 900, h = 900) | |
tps_grid(`Pre-European`, `Post-European` , shp.lwd = c(2, 2), shp.border = c("red", "blue"), | |
amp = 1, grid.size = 18, legend = T) | |
dev.off() | |
png("Pottery_outlines/tps-pre-Euro-to-Chinese.png", res = 300, w = 900, h = 900) | |
tps_grid(`Pre-European`, `Chinese Contact`, shp.lwd = c(2, 2), shp.border = c("red", "blue"), | |
amp = 1, grid.size = 18, legend = T) | |
dev.off() | |
# Isodeformation lines | |
tps_iso(`Pre-European`, `Post-European`, shp.lwd = c(2, 2), shp.border = c("red", "blue"), | |
amp = 0.5, iso.nb = 10000, iso.levels = 12, legend.text = c("Pre-contact", "Post-contact"), | |
palette = col_summer, grid = F) | |
#------------------------------------------------------------- | |
# pca | |
# these are fast | |
op.p <- PCA(op) | |
# We can see the pots groups by phase, and their shape distribution | |
# according to the first two principle components of the EFA | |
# PC1 and PC2 explain 79% of variation | |
png("Pottery_outlines/pca-biplot.png", res = 300, w = 1500, h = 1500) | |
plot(op.p, 2, | |
morpho = TRUE, | |
ellipses = TRUE, | |
cex = 1, | |
cex.labelsgroups = 1.5) | |
dev.off() | |
plot(PCA(op), 2, pos.shp="full") | |
plot(PCA(op), 2, pos.shp="range") | |
plot(PCA(op), 2, pos.shp="xy") | |
plot(PCA(op), 2, pos.shp="range_axes") | |
plot(PCA(op), 2, pos.shp="full_axes") | |
# the contributions of each PC to the shapes | |
pc_contrib <- PCcontrib(op.p, nax = 1:3) | |
pc_contrib$gg + | |
geom_polygon(fill="slategrey", col="black") + | |
ggtitle("Shape variation along PC axes") + | |
theme_minimal(base_size = 6) | |
ggsave("Pottery_outlines/pca-contrib.png") | |
scree_plot(op.p) | |
# PC1 is height of vessel | |
# PC2 is neck constriction | |
# PC3 is height of neck and roundness of body | |
#----------------------------------------------------------- | |
# lda | |
# we performed a linear discriminant analysis (LDA) based on the new shape variables (PCs), with a leave-one-out cross-validation procedure, to identify the linear combination of shape features that was able to discriminate between phases | |
# we work on PCA scores | |
bot.l <- LDA(op.p, 2) | |
# print a summary, along with the leave-one-out cross-validation table. | |
bot.l | |
# plot.LDA works pretty much with the same grammar as plot.PCA | |
# here we only have one LD | |
plot(bot.l) | |
# plot the cross-validation table | |
plot_CV(bot.l) | |
# Does not produce an efficient separation between groups and a cross-validation (leave-one-out) at phase level showing a low confidence in the reclassification. But best for Pre-e, indicating these have the most distinctive shapes | |
#-------------------------------------------------- | |
# Unsupervised model-based clustering | |
# what are the best grouping of the pots by shape? | |
library(mclust) | |
BIC <- mclustBIC(op.p$x[ , 1:5]) | |
mod1 <- Mclust(op.p$x[ , 1:5], x = BIC) | |
summary(mod1, parameters = TRUE) | |
plot(mod1, what = "classification") | |
# no clusters apparent | |
#---------------------------------------------------- | |
# MANOVA: Multivariate Analysis of (co)variace | |
# We can test for a difference in the distribution of PC scores by calculating a pairwise combination between every levels of a fac. | |
manv <- MANOVA_PW(op.p, "phase") | |
manv$summary | |
# shows sig diff in shape for all but ch-con vs. post-e | |
# $summary | |
# Df Pillai approx F num Df den Df | |
# ch-con - post-e 1 0.3900054 1.685582 11 29 | |
# ch-con - pre-e 1 0.7083273 7.506274 11 34 | |
# post-e - pre-e 1 0.3677487 2.485230 11 47 | |
# Pr(>F) | |
# ch-con - post-e 1.270691e-01 | |
# ch-con - pre-e 2.411143e-06 | |
# post-e - pre-e 1.506824e-02 | |
#---------------------------------------------------- | |
# GAMMS from https://www-nature-com.offcampus.lib.washington.edu/articles/s41598-018-20122-9 | |
# Generalised additive mixed models (GAMMs) were used to explain shape variance with respect to mean environmental parameters and shell size, and to compare between individual shape features (PCs). A GAMM is a generalised linear mixed model with a linear predictor involving a sum of smooth functions of covariates. The model allowed for flexible specification of the dependence of the response on the covariates by defining regression splines (smoothing functions) and estimating their optimal degree of smoothness, rather than calculating parametric relationships | |
ggplot(pottery_data, | |
aes(phase, | |
as.numeric(Thick1_Neck))) + | |
geom_boxplot() | |
pottery_data$thick_neck1 <- as.numeric(pottery_data$Thick1_Neck) | |
pottery_data$thick_body1 <- as.numeric(pottery_data$Thick1_Body) | |
pottery_data$height <- as.numeric(pottery_data$Hight) | |
plot(op.p$x[ , 2], pottery_data$thick_body1) | |
gams_input <- data.frame(thick_body1 = pottery_data$thick_body1 , | |
height = pottery_data$height, | |
shape_pc1 = op.p$x[ , 1]) | |
library(brms) | |
m2 <- brm(bf(shape_pc1 ~ s(thick_body1)), | |
data = gams_input, family = gaussian(), cores = 4, seed = 17, | |
iter = 4000, warmup = 1000, thin = 10, refresh = 0, | |
control = list(adapt_delta = 0.99)) | |
summary(m2) # so many zeros... not very promising | |
# plot marginal effects for each predictor | |
plot(marginal_effects(m2), ask = FALSE) | |
loo(m2) | |
pp_check(m2) | |
#---------------------------------------------------- | |
# Spatial | |
#---------------------------------------------------- | |
# CVs on shape PCs for standardisation | |
# plot PC1-3 distribution by phase | |
pc_plot <- data_frame(pc = c(op.p$x[,1], | |
op.p$x[,2], | |
op.p$x[,3]), | |
phase = rep(op.p$fac$phase, 3), | |
pcn = str_glue('PC{unlist(map(1:3, ~rep(.x, length(op.p$x[,1]))))}')) | |
ggplot(pc_plot, | |
aes(phase, | |
pc)) + | |
geom_violin() + | |
ggforce::geom_sina() + | |
facet_wrap(~ pcn, ncol = 1) + | |
theme_minimal() | |
ggsave("Pottery_outlines/pcs-boxplot-for-shape-variation.png", | |
h = 10, w = 5) | |
library(cvequality) | |
pc1 <- data.frame( | |
x = op.p$x[,1], | |
y = as.character(op.p$fac$phase), | |
stringsAsFactors = F) | |
cv_test_pc1 <- mslr_test(nr = 1000, pc1$x, pc1$y) | |
pc2 <- data.frame( | |
x = op.p$x[,2], | |
y = as.character(op.p$fac$phase), | |
stringsAsFactors = F) | |
cv_test_pc2 <- mslr_test(nr = 1000, pc2$x, pc2$y) | |
pc3 <- data.frame( | |
x = op.p$x[,3], | |
y = as.character(op.p$fac$phase), | |
stringsAsFactors = F) | |
cv_test_pc3 <- mslr_test(nr = 1000, pc3$x, pc3$y) | |
# do this pairwise | |
cv_test_pc1.1 <- | |
mslr_test(nr = 1000, | |
pc1$x[pc1$y %in% c("Post-European", | |
"Pre-European")], | |
pc1$y[pc1$y %in% c("Post-European", | |
"Pre-European")]) | |
cv_test_pc1.2 <- | |
mslr_test(nr = 1000, | |
pc1$x[pc1$y %in% c("Chinese Contact", | |
"Pre-European")], | |
pc1$y[pc1$y %in% c("Chinese Contact", | |
"Pre-European")]) | |
cv_test_pc1.3 <- | |
mslr_test(nr = 1000, | |
pc1$x[pc1$y %in% c("Chinese Contact", | |
"Post-European")], | |
pc1$y[pc1$y %in% c("Chinese Contact", | |
"Post-European")]) | |
PC1_cv_tests <- | |
bind_rows(cv_test_pc1.1, cv_test_pc1.2, cv_test_pc1.3) %>% | |
bind_cols(phases = c("Post-European vs Pre-European", | |
"Chinese Contact vs Pre-European", | |
"Chinese Contact vs Post-European")) | |
cv_test_pc2.1 <- | |
mslr_test(nr = 1000, | |
pc2$x[pc2$y %in% c("Post-European", | |
"Pre-European")], | |
pc2$y[pc2$y %in% c("Post-European", | |
"Pre-European")]) | |
cv_test_pc2.2 <- | |
mslr_test(nr = 1000, | |
pc2$x[pc2$y %in% c("Chinese Contact", | |
"Pre-European")], | |
pc2$y[pc2$y %in% c("Chinese Contact", | |
"Pre-European")]) | |
cv_test_pc2.3 <- | |
mslr_test(nr = 1000, | |
pc2$x[pc2$y %in% c("Chinese Contact", | |
"Post-European")], | |
pc2$y[pc2$y %in% c("Chinese Contact", | |
"Post-European")]) | |
PC2_cv_tests <- | |
bind_rows(cv_test_pc2.1, | |
cv_test_pc2.2, | |
cv_test_pc2.3) %>% | |
bind_cols(phases = c("Post-European vs Pre-European", | |
"Chinese Contact vs Pre-European", | |
"Chinese Contact vs Post-European")) | |
bind_rows(PC1_cv_tests, | |
PC2_cv_tests, .id = "PC") %>% | |
mutate(PC = str_glue('PC{PC}')) %>% | |
mutate(colour = if_else(p_value < 0.05, "red", "black")) %>% | |
ggplot(aes(str_wrap(phases, 40), | |
p_value, | |
colour = I(colour))) + | |
geom_point(size = 8) + | |
facet_wrap(~ PC, ncol = 1) + | |
theme_minimal(base_size = 12) + | |
coord_cartesian(clip = 'off') + | |
xlab("") + | |
ylab("CV equality test p-values") | |
ggsave("Pottery_outlines/cv-test-on-pcs.png", h = 3, w = 12) | |
Author
benmarwick
commented
May 19, 2018
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment