Skip to content

Instantly share code, notes, and snippets.

@benmarwick
Last active August 4, 2018 17:01
Show Gist options
  • Save benmarwick/3d160da203e4abaacd670d895f1ca72a to your computer and use it in GitHub Desktop.
Save benmarwick/3d160da203e4abaacd670d895f1ca72a to your computer and use it in GitHub Desktop.
Cluster images traced by hand, using EFA and PCA, and plot
# to make traces...
# Import an EPS file in Inkscape
# Trace over it to show half of vessel profile, from top of rim to centre of base, about 20-30 segements
# detelete EPS from from Inkscape
# export PNG of trace from Inkscape
library(magick)
lf <- list.files("../traces", full.names = TRUE)
for (i in seq_along(jpgs)) {
image_write(image_convert(image_read(lf[[i]]), "jpg"), paste0(lf[[i]], "_output.jpg"))
}
jpgs<- list.files("../traces", pattern = ".jpg", full.names = TRUE)
library(Momocs)
jpgs_imported <- import_jpg(jpgs, threshold = 0.1)
jpgs_imported_coo <-
jpgs_imported[-2] %>%
Opn() %>%
Out() %>%
coo_close() %>%
coo_center %>%
coo_scale %>%
coo_align() %>%
coo_rotate(theta = pi/2) %>%
coo_slidedirection("up")
jpgs_imported_coo %>%
panel()
# efa
op <- efourier(jpgs_imported_coo, norm= FALSE)
op.p <- PCA(op)
plot(op.p,
cex = 3,
labelspoints = TRUE,
cex.labelspoints = 1)
PCcontrib(op.p)
# get SVG files from google drive
library(googledrive)
ID <- "https://drive.google.com/drive/u/1/folders/1wNLi_v40KX-U6s42eHrwueiearLvZ5Jv"
g_drive <- drive_ls( path = "Pot_outline_trace", recursive = TRUE, pattern = ".svg$")
# download SVGs from google
library(tidyverse)
map2(g_drive$id,
g_drive$name,
~as_id(.x) %>% drive_download(path = str_glue('Pottery_outlines/outlines_svg/{.y}')))
lf <- list.files("Pottery_outlines/outlines_svg",
full.names = TRUE)
library(xml2)
remove_scan_and_fill <- function(svg_file) {
# remove the scan image from the SVG
t1 <- read_xml(svg_file)
t2 <- xml_children(t1)
scan_layer <- t2[[4]]
xml_remove(scan_layer)
# now we have a SVG without the scan layer
# change from outline to fill
stroke_and_fill_node <- xml_children(t1)[4]
stroke_and_fill_node_inner <- xml_children(stroke_and_fill_node)
xml_attr(stroke_and_fill_node_inner, "style") <-
"fill:#000000;stroke:none;stroke-width:5;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-dasharray:none;stroke-opacity:1"
write_xml(t1,
str_glue("Pottery_outlines/outlines_svg/fill_{basename(svg_file)}"))
# fit image object to canvas in svg so it's all captured during the conversion to JPG
system(str_glue('inkscape --verb=EditSelectAll --verb=AlignHorizontalCenter --verb=AlignVerticalCenter --verb=FitCanvasToSelectionOrDrawing --verb=FileSave --verb=FileQuit "$(pwd)/Pottery_outlines/outlines_svg/fill_{basename(svg_file)}"'))
}
# run the function on all SVGs
walk(lf, remove_scan_and_fill)
# convert SVG to PNG
filled_svgs <- list.files("Pottery_outlines/outlines_svg", pattern = "^fill_", full.names = TRUE)
# convert svgs to to jpg
library(magick)
map(filled_svgs,
~image_read_svg(.x) %>%
image_background("white") %>%
image_convert("jpg") %>%
image_write(str_glue("{.x}_output.jpg")))
# from https://gist.github.com/benmarwick/3d160da203e4abaacd670d895f1ca72a
#--------------------------------------------------
# Once we have the jpgs, start from here
library(tidyverse)
# get list of jpgs only
jpgs <- list.files("Pottery_outlines/outlines_svg", pattern = ".jpg", full.names = TRUE)
# join file names to phase
pottery_data <- readr::read_csv("Pottery_outlines/KWL_Ceramic_final2018_with_phases.csv")
pottery_data <- pottery_data %>%
mutate(filename = str_glue('fill_{filename}.svg_output.jpg'))
# some outlines that we have no phases for...
sdiff <- setdiff(pottery_data$filename, basename(jpgs))
# the ones we do have both outlines and phases
isect <- intersect(pottery_data$filename, basename(jpgs))
jpgs <- jpgs[basename(jpgs) %in% isect]
pottery_data <-
pottery_data %>%
filter(filename %in% isect) %>%
distinct(filename, phase, .keep_all = TRUE)
phases_isect <-
pottery_data %>%
select(filename, phase) %>%
filter(filename %in% isect) %>%
mutate_all(as.factor) %>%
distinct(filename, phase) %>%
mutate(phase = as.factor(case_when(
phase == 'post-e' ~ "Post-European",
phase == 'pre-e' ~ "Pre-European",
phase == 'ch-con' ~ "Chinese Contact",
)))
# how many pots in each phase?
phases_isect %>%
group_by(phase) %>%
tally(sort = T)
# start shape exploration
library(Momocs)
# it freezes on the empty ones...
jpgs_imported <- import_jpg(jpgs, threshold = 0.5)
# Normalise the shapes, from
# https://www.repository.cam.ac.uk/bitstream/handle/1810/266423/Supplementary%20Code%20S1.pdf?sequence=20&isAllowed=y
jpgs_imported_coo <-
jpgs_imported %>%
Out(., fac = phases_isect) %>%
# Centring, scaling and sampling pseudo-landmarks
coo_center %>%
coo_scale %>%
coo_sample(., 1000) %>%
# Procrustes superimposition
fgProcrustes %>%
# Normalisation of the starting points
coo_slidedirection
jpgs_imported_coo %>%
panel(., fac = phases_isect$phase)
pile(jpgs_imported_coo)
# check how to do outlines: https://vbonhomme.github.io/Momocs/vignettes/Momocs_speed_dating.html
#-----------------------------------------------------------
# efa
# this is quite slow...
op <- efourier(jpgs_imported_coo, norm = FALSE)
# The progressive capture of shape geometry along the number of harmonics can be visualized with:
calibrate_reconstructions_efourier(jpgs_imported_coo,
id = 1,
range = c(1, 2, 4, 8, 14, 20))
# mean shape, per group
bot.ms <- mshapes(op, 2)
`Chinese Contact` <- bot.ms$shp$`Chinese Contact` %T>% coo_plot(border="blue")
`Post-European` <- bot.ms$shp$`Post-European` %T>% coo_draw(border="red")
`Pre-European` <- bot.ms$shp$`Pre-European` %T>% coo_draw(border="green")
legend("topright",
lwd=1,
col=c("blue", "red", "green"),
legend=c("Chinese contact", "Post-European", "Pre-European"))
# Thin plate spline analysis (TPS)
# Thin plate splines (TPS) was used to compare average shapes and visualise the outline deformations required to pass from an extreme of the morphospace to the other.
# Deformation grids
png("Pottery_outlines/tps-pre-Euro-to-post-Euro.png", res = 300, w = 900, h = 900)
tps_grid(`Pre-European`, `Post-European` , shp.lwd = c(2, 2), shp.border = c("red", "blue"),
amp = 1, grid.size = 18, legend = T)
dev.off()
png("Pottery_outlines/tps-pre-Euro-to-Chinese.png", res = 300, w = 900, h = 900)
tps_grid(`Pre-European`, `Chinese Contact`, shp.lwd = c(2, 2), shp.border = c("red", "blue"),
amp = 1, grid.size = 18, legend = T)
dev.off()
# Isodeformation lines
tps_iso(`Pre-European`, `Post-European`, shp.lwd = c(2, 2), shp.border = c("red", "blue"),
amp = 0.5, iso.nb = 10000, iso.levels = 12, legend.text = c("Pre-contact", "Post-contact"),
palette = col_summer, grid = F)
#-------------------------------------------------------------
# pca
# these are fast
op.p <- PCA(op)
# We can see the pots groups by phase, and their shape distribution
# according to the first two principle components of the EFA
# PC1 and PC2 explain 79% of variation
png("Pottery_outlines/pca-biplot.png", res = 300, w = 1500, h = 1500)
plot(op.p, 2,
morpho = TRUE,
ellipses = TRUE,
cex = 1,
cex.labelsgroups = 1.5)
dev.off()
plot(PCA(op), 2, pos.shp="full")
plot(PCA(op), 2, pos.shp="range")
plot(PCA(op), 2, pos.shp="xy")
plot(PCA(op), 2, pos.shp="range_axes")
plot(PCA(op), 2, pos.shp="full_axes")
# the contributions of each PC to the shapes
pc_contrib <- PCcontrib(op.p, nax = 1:3)
pc_contrib$gg +
geom_polygon(fill="slategrey", col="black") +
ggtitle("Shape variation along PC axes") +
theme_minimal(base_size = 6)
ggsave("Pottery_outlines/pca-contrib.png")
scree_plot(op.p)
# PC1 is height of vessel
# PC2 is neck constriction
# PC3 is height of neck and roundness of body
#-----------------------------------------------------------
# lda
# we performed a linear discriminant analysis (LDA) based on the new shape variables (PCs), with a leave-one-out cross-validation procedure, to identify the linear combination of shape features that was able to discriminate between phases
# we work on PCA scores
bot.l <- LDA(op.p, 2)
# print a summary, along with the leave-one-out cross-validation table.
bot.l
# plot.LDA works pretty much with the same grammar as plot.PCA
# here we only have one LD
plot(bot.l)
# plot the cross-validation table
plot_CV(bot.l)
# Does not produce an efficient separation between groups and a cross-validation (leave-one-out) at phase level showing a low confidence in the reclassification. But best for Pre-e, indicating these have the most distinctive shapes
#--------------------------------------------------
# Unsupervised model-based clustering
# what are the best grouping of the pots by shape?
library(mclust)
BIC <- mclustBIC(op.p$x[ , 1:5])
mod1 <- Mclust(op.p$x[ , 1:5], x = BIC)
summary(mod1, parameters = TRUE)
plot(mod1, what = "classification")
# no clusters apparent
#----------------------------------------------------
# MANOVA: Multivariate Analysis of (co)variace
# We can test for a difference in the distribution of PC scores by calculating a pairwise combination between every levels of a fac.
manv <- MANOVA_PW(op.p, "phase")
manv$summary
# shows sig diff in shape for all but ch-con vs. post-e
# $summary
# Df Pillai approx F num Df den Df
# ch-con - post-e 1 0.3900054 1.685582 11 29
# ch-con - pre-e 1 0.7083273 7.506274 11 34
# post-e - pre-e 1 0.3677487 2.485230 11 47
# Pr(>F)
# ch-con - post-e 1.270691e-01
# ch-con - pre-e 2.411143e-06
# post-e - pre-e 1.506824e-02
#----------------------------------------------------
# GAMMS from https://www-nature-com.offcampus.lib.washington.edu/articles/s41598-018-20122-9
# Generalised additive mixed models (GAMMs) were used to explain shape variance with respect to mean environmental parameters and shell size, and to compare between individual shape features (PCs). A GAMM is a generalised linear mixed model with a linear predictor involving a sum of smooth functions of covariates. The model allowed for flexible specification of the dependence of the response on the covariates by defining regression splines (smoothing functions) and estimating their optimal degree of smoothness, rather than calculating parametric relationships
ggplot(pottery_data,
aes(phase,
as.numeric(Thick1_Neck))) +
geom_boxplot()
pottery_data$thick_neck1 <- as.numeric(pottery_data$Thick1_Neck)
pottery_data$thick_body1 <- as.numeric(pottery_data$Thick1_Body)
pottery_data$height <- as.numeric(pottery_data$Hight)
plot(op.p$x[ , 2], pottery_data$thick_body1)
gams_input <- data.frame(thick_body1 = pottery_data$thick_body1 ,
height = pottery_data$height,
shape_pc1 = op.p$x[ , 1])
library(brms)
m2 <- brm(bf(shape_pc1 ~ s(thick_body1)),
data = gams_input, family = gaussian(), cores = 4, seed = 17,
iter = 4000, warmup = 1000, thin = 10, refresh = 0,
control = list(adapt_delta = 0.99))
summary(m2) # so many zeros... not very promising
# plot marginal effects for each predictor
plot(marginal_effects(m2), ask = FALSE)
loo(m2)
pp_check(m2)
#----------------------------------------------------
# Spatial
#----------------------------------------------------
# CVs on shape PCs for standardisation
# plot PC1-3 distribution by phase
pc_plot <- data_frame(pc = c(op.p$x[,1],
op.p$x[,2],
op.p$x[,3]),
phase = rep(op.p$fac$phase, 3),
pcn = str_glue('PC{unlist(map(1:3, ~rep(.x, length(op.p$x[,1]))))}'))
ggplot(pc_plot,
aes(phase,
pc)) +
geom_violin() +
ggforce::geom_sina() +
facet_wrap(~ pcn, ncol = 1) +
theme_minimal()
ggsave("Pottery_outlines/pcs-boxplot-for-shape-variation.png",
h = 10, w = 5)
library(cvequality)
pc1 <- data.frame(
x = op.p$x[,1],
y = as.character(op.p$fac$phase),
stringsAsFactors = F)
cv_test_pc1 <- mslr_test(nr = 1000, pc1$x, pc1$y)
pc2 <- data.frame(
x = op.p$x[,2],
y = as.character(op.p$fac$phase),
stringsAsFactors = F)
cv_test_pc2 <- mslr_test(nr = 1000, pc2$x, pc2$y)
pc3 <- data.frame(
x = op.p$x[,3],
y = as.character(op.p$fac$phase),
stringsAsFactors = F)
cv_test_pc3 <- mslr_test(nr = 1000, pc3$x, pc3$y)
# do this pairwise
cv_test_pc1.1 <-
mslr_test(nr = 1000,
pc1$x[pc1$y %in% c("Post-European",
"Pre-European")],
pc1$y[pc1$y %in% c("Post-European",
"Pre-European")])
cv_test_pc1.2 <-
mslr_test(nr = 1000,
pc1$x[pc1$y %in% c("Chinese Contact",
"Pre-European")],
pc1$y[pc1$y %in% c("Chinese Contact",
"Pre-European")])
cv_test_pc1.3 <-
mslr_test(nr = 1000,
pc1$x[pc1$y %in% c("Chinese Contact",
"Post-European")],
pc1$y[pc1$y %in% c("Chinese Contact",
"Post-European")])
PC1_cv_tests <-
bind_rows(cv_test_pc1.1, cv_test_pc1.2, cv_test_pc1.3) %>%
bind_cols(phases = c("Post-European vs Pre-European",
"Chinese Contact vs Pre-European",
"Chinese Contact vs Post-European"))
cv_test_pc2.1 <-
mslr_test(nr = 1000,
pc2$x[pc2$y %in% c("Post-European",
"Pre-European")],
pc2$y[pc2$y %in% c("Post-European",
"Pre-European")])
cv_test_pc2.2 <-
mslr_test(nr = 1000,
pc2$x[pc2$y %in% c("Chinese Contact",
"Pre-European")],
pc2$y[pc2$y %in% c("Chinese Contact",
"Pre-European")])
cv_test_pc2.3 <-
mslr_test(nr = 1000,
pc2$x[pc2$y %in% c("Chinese Contact",
"Post-European")],
pc2$y[pc2$y %in% c("Chinese Contact",
"Post-European")])
PC2_cv_tests <-
bind_rows(cv_test_pc2.1,
cv_test_pc2.2,
cv_test_pc2.3) %>%
bind_cols(phases = c("Post-European vs Pre-European",
"Chinese Contact vs Pre-European",
"Chinese Contact vs Post-European"))
bind_rows(PC1_cv_tests,
PC2_cv_tests, .id = "PC") %>%
mutate(PC = str_glue('PC{PC}')) %>%
mutate(colour = if_else(p_value < 0.05, "red", "black")) %>%
ggplot(aes(str_wrap(phases, 40),
p_value,
colour = I(colour))) +
geom_point(size = 8) +
facet_wrap(~ PC, ncol = 1) +
theme_minimal(base_size = 12) +
coord_cartesian(clip = 'off') +
xlab("") +
ylab("CV equality test p-values")
ggsave("Pottery_outlines/cv-test-on-pcs.png", h = 3, w = 12)
@benmarwick
Copy link
Author

rplot01

@benmarwick
Copy link
Author

output from pca-pottery-outlines.R

image

image

image

image

image

image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment