Skip to content

Instantly share code, notes, and snippets.

@macleginn
Last active January 14, 2019 15:36
Show Gist options
  • Select an option

  • Save macleginn/d4d058c456f00771f74a629b8de1237c to your computer and use it in GitHub Desktop.

Select an option

Save macleginn/d4d058c456f00771f74a629b8de1237c to your computer and use it in GitHub Desktop.
## Load the raw BDPROTO table (comma-separated, with a header row).
proto.data <- read.csv('bdproto.csv', sep = ',')

## Clean the data: keep only rows that name both a family root and a
## language, i.e. where neither field is missing or the empty string.
has.family <- !is.na(proto.data$LanguageFamilyRoot) &
  proto.data$LanguageFamilyRoot != ''
has.language <- !is.na(proto.data$LanguageName) &
  proto.data$LanguageName != ''
p.d <- proto.data[has.family & has.language, ]

## Number of rows per phoneme in the cleaned table.
phoneme.counts <- table(p.d$Phoneme)

## People mostly reconstruct weird stuff -- most phonemes are very rare.
## Recorded console output:
## > quantile(table(p.d$Phoneme))
##   0%  25%  50%  75% 100%
##    0    1    1    4  205
## > quantile(table(p.d$Phoneme)[table(p.d$Phoneme) >= 10])
##   0%  25%  50%  75% 100%
##   10   13   28   73  205

## Strata for the resampling: every distinct family root.
families <- unique(p.d$LanguageFamilyRoot)
## Phonemes tracked in the analysis: those with at least 5 rows.
phonemes <- names(phoneme.counts[phoneme.counts >= 5])
## Bootstrap resampling (one random language per family in each replica).
## The result columns are allocated up front: 1000 replicas per dataset,
## one cell per tracked phoneme in each replica. The first half of every
## column belongs to BDPROTO, the second half to PHOIBLE.
n.per.dataset <- length(phonemes) * 1000
phonemes.col <- character(2 * n.per.dataset)
frequencies.col <- numeric(2 * n.per.dataset)
datasets.col <- rep(c('bdproto', 'phoible'), each = n.per.dataset)
print("BDPROTO")
## BDPROTO bootstrap: each of the 1000 replicas draws one random language
## per family and stores, for every tracked phoneme, the number of rows of
## that phoneme among the drawn languages divided by the number of families.
## Results go into the first length(phonemes) * 1000 cells of phonemes.col
## and frequencies.col.
n.phonemes <- length(phonemes)
proto.family <- as.character(p.d$LanguageFamilyRoot)
proto.language <- as.character(p.d$LanguageName)
## Candidate languages per family. They do not change between replicas, so
## they are computed once, in the same order as `families`.
langs.by.family <- lapply(split(proto.language, proto.family),
                          unique)[as.character(families)]
for (i in 1:1000) {
  if (i %% 50 == 0)
    print(i)
  ## Stratified draw: one language name per family. The result is named by
  ## family, so rows can be matched on the (family, language) pair itself
  ## instead of on a pasted key, which could collide without a separator.
  chosen <- vapply(langs.by.family,
                   function(langs) langs[sample.int(length(langs), 1)],
                   character(1))
  in.replica <- proto.language == unname(chosen[proto.family])
  ## Count rows per tracked phoneme. tabulate() leaves 0 for phonemes that
  ## are absent from the replica and ignores phonemes that are not tracked.
  ## No droplevels() here: it has no method for character vectors, which is
  ## what read.csv() returns by default since R 4.0.
  counts <- tabulate(match(p.d$Phoneme[in.replica], phonemes),
                     nbins = n.phonemes)
  ## Cells of replica i; seq_len() keeps this a no-op when no phoneme is
  ## tracked.
  slots <- (i - 1) * n.phonemes + seq_len(n.phonemes)
  phonemes.col[slots] <- phonemes
  frequencies.col[slots] <- counts / length(families)
}
## Do the same for PHOIBLE.
## NOTE: phoible.dir is prepended to the file names verbatim, so it must
## end with a path separator (or be a deliberate file-name prefix).
phoible.dir <- 'XXX'
meta.path <- paste0(phoible.dir, 'phoible-aggregated.tsv')
phons.path <- paste0(phoible.dir, 'phoible-phonemes.tsv')
## Both files are tab-separated tables with a header row.
phoible.meta <- read.csv(meta.path, sep = '\t')
phoible.phons <- read.csv(phons.path, sep = '\t')
## The tracked phonemes stay the same as for BDPROTO; only the set of
## families (the resampling strata) is taken from PHOIBLE itself.
phoible.families <- unique(phoible.meta$LanguageFamilyRoot)
print("PHOIBLE")
## PHOIBLE bootstrap: each of the 1000 replicas draws one random inventory
## per family (inventory IDs are used instead of language names) and
## stores, for every tracked phoneme, the number of rows of that phoneme
## among the drawn inventories divided by the number of PHOIBLE families.
## Results go into the second length(phonemes) * 1000 cells of phonemes.col
## and frequencies.col.
n.phonemes <- length(phonemes)
## Candidate inventory IDs per family, computed once. split() drops rows
## whose family is NA, so every group is non-empty and no NA row leaks into
## a candidate pool.
ids.by.family <- lapply(split(phoible.meta$InventoryID,
                              as.character(phoible.meta$LanguageFamilyRoot)),
                        unique)
for (i in 1:1000) {
  if (i %% 50 == 0)
    print(i)
  ## Stratified draw: one inventory ID per family. Draw a *position* with
  ## sample.int() rather than calling sample() on the IDs: for a family
  ## with a single numeric ID, sample(id, 1) samples from 1:id and would
  ## return an arbitrary inventory from some other family.
  keys <- unlist(lapply(ids.by.family,
                        function(ids) ids[sample.int(length(ids), 1)]),
                 use.names = FALSE)
  sampled <- phoible.phons$Phoneme[phoible.phons$InventoryID %in% keys]
  ## Count rows per tracked phoneme. tabulate() leaves 0 for phonemes that
  ## are absent from the replica and ignores phonemes that are not tracked.
  ## No droplevels() here: it has no method for character vectors, which is
  ## what read.csv() returns by default since R 4.0.
  counts <- tabulate(match(sampled, phonemes), nbins = n.phonemes)
  ## Cells of replica i, offset past the BDPROTO half of the columns.
  slots <- n.phonemes * 1000 + (i - 1) * n.phonemes + seq_len(n.phonemes)
  phonemes.col[slots] <- phonemes
  frequencies.col[slots] <- counts / length(phoible.families)
}
## Long-format table of the bootstrap results: one row per
## (dataset, replica, phoneme) cell.
boot.df <- data.frame(phoneme = phonemes.col,
                      frequency = frequencies.col,
                      dataset = datasets.col)

## Phonemes singled out for the first comparison plot.
basic.phons <- c('p', 't', 'k', 'f', 'b', 'd', 'ɡ', 'v')
library(ggplot2)

## Boxplots of the bootstrapped frequencies of the basic phonemes, with the
## two datasets side by side for each phoneme.
basic.df <- droplevels(boot.df[boot.df$phoneme %in% basic.phons, ])
ggplot(basic.df, aes(x = phoneme, y = frequency, fill = dataset)) +
  geom_boxplot() +
  theme_bw() +
  theme(axis.text.x = element_text(size = 13))
## For every tracked phoneme: median bootstrapped frequency in PHOIBLE
## minus median bootstrapped frequency in BDPROTO. The result is a numeric
## vector named by phoneme.
## The dataset masks are the same for every phoneme, so build them once.
is.proto <- boot.df$dataset == 'bdproto'
is.phoible <- boot.df$dataset == 'phoible'
## vapply() returns the whole named vector at once (and checks that each
## difference is a single number) instead of growing it element by element.
median.difs <- vapply(phonemes, function(p) {
  is.p <- boot.df$phoneme == p
  median(boot.df$frequency[is.p & is.phoible]) -
    median(boot.df$frequency[is.p & is.proto])
}, numeric(1))
## The five phonemes whose median frequency is highest in PHOIBLE relative
## to BDPROTO. TRUE is spelled out because T is an ordinary variable that
## can be reassigned.
top5 <- head(names(sort(median.difs, decreasing = TRUE)),
             n = 5)
## Boxplots of the bootstrapped frequencies for the phonemes in top5, with
## the two datasets side by side for each phoneme.
top5.df <- droplevels(boot.df[boot.df$phoneme %in% top5, ])
ggplot(top5.df, aes(x = phoneme, y = frequency, fill = dataset)) +
  geom_boxplot() +
  theme_bw() +
  theme(axis.text.x = element_text(size = 13))
## Read the borrowed segments as one flat character vector (the file is
## comma-separated).
current.borrowings <- scan('current_borrowings.dat', what = character(), sep = ',')
## Show the 20 most frequent entries exactly as they are spelled in the
## file, i.e. before the respelling below.
head(sort(table(current.borrowings), decreasing = TRUE), n = 20)

## Respell three segments; every other entry is left untouched.
old.forms <- c('tʃ', 'dʒ', 'g')
new.forms <- c('t̠ʃ', 'd̠ʒ', 'ɡ')
form.idx <- match(current.borrowings, old.forms)
to.respell <- !is.na(form.idx)
current.borrowings[to.respell] <- new.forms[form.idx[to.respell]]

## Segments that are both borrowed and among the tracked phonemes.
borrowed.and.reconstructed <- intersect(phonemes, current.borrowings)

## x: how often each such segment occurs in the borrowings list;
## y: its PHOIBLE-minus-BDPROTO median frequency difference.
## The names x and y are kept because they end up as the axis labels.
borrowing.counts <- table(current.borrowings)
x <- as.numeric(borrowing.counts[borrowed.and.reconstructed])
y <- median.difs[borrowed.and.reconstructed]
scatter.smooth(y ~ x)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment