Last active
November 24, 2020 15:21
-
-
Save gongcastro/c751a62245b17562e18d8a145d5950c6 to your computer and use it in GitHub Desktop.
Compute pairwise orthographic and phonological distance across translation equivalents of "mask"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#### mask ---------------------------------------------------------------------- | |
# import packages | |
library(tidyverse) | |
library(readxl) | |
library(stringdist) | |
#### import data --------------------------------------------------------------- | |
# you can download the data from https://drive.google.com/file/d/18SeJTiM2-JXR9SOqEg22wdkvNL3OxG3u/view?usp=sharing | |
dat <- read_xlsx("mask.xlsx") %>% | |
mutate(roman = as.logical(roman), | |
romanisation = str_to_lower(romanisation), | |
label = paste0(language, ": ", form, " (", romanisation, ")"), | |
label_phon = paste0(language, ": ", form, " /", ipa_flat, "/")) %>% | |
drop_na(form) %>% | |
filter(region %in% c("Europe", "Asia", "Africa", "Oceania"), # other regions only provide missing values | |
language != "Hmong") %>% # outlier, makes any other scores seem too close to each other (see masks.xlsx) | |
as_tibble() | |
#### orthographic distance ----------------------------------------------------- | |
# generate all pairwise combinations of orthographic forms and compute their Levenshtein distance | |
expanded_ort <- expand_grid(label1 = dat$label, | |
label2 = dat$label) %>% | |
left_join(., { | |
dat %>% | |
select(label, romanisation) %>% | |
drop_na(romanisation) %>% | |
rename_all(function(x) paste0(x, "1")) | |
}) %>% | |
left_join(., { | |
dat %>% | |
select(label, romanisation) %>% | |
drop_na(romanisation) %>% | |
rename_all(function(x) paste0(x, "2")) | |
}) %>% | |
rowwise() %>% | |
mutate(n = ifelse(nchar(romanisation1) > nchar(romanisation2), nchar(romanisation1), nchar(romanisation2)), | |
lv = stringdist(romanisation1, romanisation2, method = "lv"), | |
lv_norm = lv/n) %>% | |
ungroup() | |
# plot data | |
ggplot(expanded_ort, aes(fct_reorder(label1, lv_norm), | |
fct_reorder(label2, lv_norm), | |
fill = lv_norm)) + | |
geom_tile() + | |
labs(fill = "Levenshtein distance", | |
title = "Orthographic distance between romanisations of translations of MASK", | |
subtitle = paste0("N = ", sum(!is.na(dat$romanisation), na.rm = TRUE)), | |
caption = "Levenshtein distance: Number of insertions, deletions, and replacements a character string\nhas to go trough to be identical to other | @gongcastro") + | |
scale_fill_distiller(palette = "Blues") + | |
theme_minimal() + | |
theme(axis.text.x = element_text(angle = 90, hjust = 1), | |
axis.title = element_blank(), | |
axis.text = element_text(colour = "black"), | |
plot.title = element_text(size = 25), | |
plot.subtitle = element_text(size = 20), | |
plot.caption = element_text(hjust = 0, size = 15), | |
legend.position = "top", | |
legend.title = element_text(face = "bold", size = 20), | |
legend.text = element_text(size = 15), | |
legend.key.size = unit(1.5, "cm")) + | |
ggsave("masks_ort.png", height = 18, width = 15) | |
#### phonological distance ----------------------------------------------------- | |
# generate all pairwise combinations of phonological forms and compute their Levenshtein distance | |
expanded_phon <- expand_grid(label_phon1 = dat$label_phon, label_phon2 = dat$label_phon) %>% | |
left_join(., { | |
dat %>% | |
select(label_phon, ipa_flat) %>% | |
drop_na(ipa_flat) %>% | |
rename_all(function(x) paste0(x, "1")) | |
}) %>% | |
left_join(., { | |
dat %>% | |
select(label_phon, ipa_flat) %>% | |
drop_na(ipa_flat) %>% | |
rename_all(function(x) paste0(x, "2")) | |
}) %>% | |
rowwise() %>% | |
mutate(n = ifelse(nchar(ipa_flat1) > nchar(ipa_flat2), nchar(ipa_flat1), nchar(ipa_flat2)), | |
lv_phon = stringdist(ipa_flat1, ipa_flat2, method = "lv"), | |
lv_phon_norm = lv_phon/n) %>% | |
ungroup() %>% | |
drop_na(lv_phon_norm) | |
# plot data | |
ggplot(expanded_phon, aes(fct_reorder(label_phon1, lv_phon_norm), | |
fct_reorder(label_phon2, lv_phon_norm), | |
fill = lv_phon_norm)) + | |
geom_tile() + | |
labs(fill = "Levenshtein distance", | |
title = "Phonological distance between IPA transcriptions of translations of MASK", | |
subtitle = paste0("N = ", sum(!is.na(dat$ipa_flat), na.rm = TRUE)), | |
caption = "Levenshtein distance: Number of insertions, deletions, and replacements a character string\nhas to go trough to be identical to other | @gongcastro") + | |
scale_fill_distiller(palette = "YlOrRd") + | |
theme_minimal() + | |
theme(axis.text.x = element_text(angle = 90, hjust = 1), | |
axis.title = element_blank(), | |
axis.text = element_text(colour = "black"), | |
plot.title = element_text(size = 25), | |
plot.subtitle = element_text(size = 20), | |
plot.caption = element_text(hjust = 0, size = 15), | |
legend.position = "top", | |
legend.title = element_text(face = "bold", size = 20), | |
legend.text = element_text(size = 15), | |
legend.key.size = unit(1.5, "cm")) + | |
ggsave("masks_phon.png", height = 18, width = 15) | |
#### onset --------------------------------------------------------------------- | |
# compute how many orthographic/phonological forms share onset | |
onsets <- list( | |
Orthography = dat %>% | |
mutate(onset = as.factor(substr(romanisation, 1, 1))) %>% | |
count(onset) %>% | |
mutate(prop = n/sum(.$n)), | |
Phonology = dat %>% | |
drop_na(ipa_flat) %>% | |
mutate(onset = as.factor(substr(ipa_flat, 1, 1))) %>% | |
count(onset) %>% | |
mutate(prop = n/sum(.$n), | |
onset = paste0("/", onset, "/")) | |
) %>% | |
bind_rows(.id = "type") %>% | |
mutate(type = ifelse( | |
type=="Orthography", | |
paste0(type, " (N = ", sum(!is.na(dat$romanisation), na.rm = TRUE), ")"), | |
paste0(type, " (N = ", sum(!is.na(dat$ipa_flat), na.rm = TRUE), ")") | |
)) %>% | |
group_by(type) %>% | |
arrange(desc(prop)) %>% | |
mutate(index = row_number()) %>% | |
ungroup() | |
# plot data | |
ggplot(onsets, aes(x = fct_reorder(onset, prop, .desc = TRUE), | |
y = prop, | |
order = -prop, | |
label = n, | |
fill = index)) + | |
facet_wrap(~type, scales = "free", dir = "v") + | |
geom_col(colour = "white", width = 1, show.legend = FALSE) + | |
geom_text(nudge_y = 0.025) + | |
scale_fill_distiller(palette = "YlOrRd") + | |
scale_y_continuous(labels = scales::percent) + | |
labs(x = "Onset", | |
y = "%", | |
fill = "Region", | |
title = "Shared onset", | |
subtitle = "How often do translations of MASK start with the same letter/phoneme?", | |
caption = "@gongcastro") + | |
theme_minimal() + | |
theme(axis.text.x = element_text(colour = "black", face = "bold", size = 12), | |
axis.text.y = element_text(colour = "black", size = 12), | |
axis.title = element_blank(), | |
strip.background = element_rect(fill = "grey", colour = NA), | |
strip.text = element_text(size = 12, face = "bold"), | |
legend.title = element_text(face = "bold"), | |
panel.grid.major.x = element_blank(), | |
plot.caption.position = "plot", | |
plot.title = element_text(face = "bold", size = 15), | |
plot.caption = element_text(hjust = 0)) + | |
ggsave("masks_onset.png", width = 6, height = 6) | |
Author
gongcastro
commented
Nov 24, 2020
- Orthographic Levenshtein distance:
- Phonological Levenshtein distance:
- Shared orthographic and phonological onsets:
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment