Last active
January 14, 2019 15:36
-
-
Save macleginn/d4d058c456f00771f74a629b8de1237c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Load the BDPROTO table (comma-separated).
proto.data <- read.csv('bdproto.csv', sep = ',')

## Clean the data: keep only rows where both the family root and the
## language name are present (neither NA nor the empty string).
no.family <- is.na(proto.data$LanguageFamilyRoot) |
  proto.data$LanguageFamilyRoot == ''
no.language <- is.na(proto.data$LanguageName) |
  proto.data$LanguageName == ''
p.d <- proto.data[!(no.family | no.language), ]

## People mostly reconstruct weird stuff -- most phonemes occur only
## a handful of times:
## > quantile(table(p.d$Phoneme))
##   0%  25%  50%  75% 100%
##    0    1    1    4  205
## > quantile(table(p.d$Phoneme)[table(p.d$Phoneme) >= 10])
##   0%  25%  50%  75% 100%
##   10   13   28   73  205

## Strata for the resampling below.
families <- unique(p.d$LanguageFamilyRoot)

## Phonemes analysed: those with at least 5 occurrences in the cleaned data.
phoneme.counts <- table(p.d$Phoneme)
phonemes <- names(phoneme.counts[phoneme.counts >= 5])
## Bootstrap resampling
## One random language per family
##
## Preallocate the result columns: 1000 replicas x length(phonemes) slots
## per dataset, with the 'bdproto' half first and the 'phoible' half second.
slots.per.dataset <- length(phonemes) * 1000
phonemes.col <- character(2 * slots.per.dataset)
frequencies.col <- numeric(2 * slots.per.dataset)
datasets.col <- rep(c('bdproto', 'phoible'), each = slots.per.dataset)
## ---- Bootstrap: BDPROTO ----
## For each of 1000 replicas, draw one language per family and record, for
## every entry of `phonemes`, its number of occurrences in the replica
## divided by the number of families. Replica i fills slots
## (i - 1) * length(phonemes) + 1 .. i * length(phonemes) of `phonemes.col`
## and `frequencies.col`.
print("BDPROTO")

## Work on character copies so the code behaves the same whether read.csv()
## produced factors (default before R 4.0) or strings (default since R 4.0).
## The previous droplevels() call fails on plain character vectors.
proto.family <- as.character(p.d$LanguageFamilyRoot)
proto.language <- as.character(p.d$LanguageName)
proto.phoneme <- as.character(p.d$Phoneme)
family.names <- as.character(families)

## Loop-invariant: the candidate languages of each family.
languages.by.family <- lapply(
  family.names,
  function(f) unique(proto.language[proto.family == f])
)

## Row keys for stratified resampling. The tab separator keeps different
## (family, language) pairs from pasting to the same string.
proto.keys <- paste(proto.family, proto.language, sep = '\t')

for (i in 1:1000) {
  if (i %% 50 == 0) {
    print(i)
  }
  ## One random language per family (indexing by a sampled position is
  ## safe for candidate sets of any length and type).
  chosen <- vapply(
    languages.by.family,
    function(langs) langs[sample.int(length(langs), 1)],
    character(1)
  )
  keys <- paste(family.names, chosen, sep = '\t')

  ## Phoneme counts in this replica; phonemes absent from it count as 0.
  freqs <- table(proto.phoneme[proto.keys %in% keys])
  counts <- as.vector(freqs)[match(phonemes, names(freqs))]
  counts[is.na(counts)] <- 0

  idx <- (i - 1) * length(phonemes) + seq_along(phonemes)
  phonemes.col[idx] <- phonemes
  frequencies.col[idx] <- counts / length(families)
}
## ---- Bootstrap: PHOIBLE ----
## Do the same for PHOIBLE: for each of 1000 replicas, draw one inventory per
## family and store per-phoneme frequencies in the second half of
## `phonemes.col` / `frequencies.col` (offset length(phonemes) * 1000).

## Placeholder path. It is pasted directly in front of the file names, so it
## must end with a path separator.
phoible.dir <- 'XXX'
phoible.meta <- read.csv(paste0(phoible.dir, 'phoible-aggregated.tsv'),
                         sep = '\t')
phoible.phons <- read.csv(paste0(phoible.dir, 'phoible-phonemes.tsv'),
                          sep = '\t')

## Phonemes are the same; families are different
phoible.families <- unique(phoible.meta$LanguageFamilyRoot)

## Character copies: independent of read.csv()'s factor/string default, and
## table() no longer needs droplevels(), which fails on character vectors.
meta.family <- as.character(phoible.meta$LanguageFamilyRoot)
phoible.phoneme <- as.character(phoible.phons$Phoneme)

## Loop-invariant: candidate inventory IDs of each family.
## NOTE(review): missing/empty family values are not filtered here (unlike
## the BDPROTO cleaning step); confirm that is intended for this table.
inventories.by.family <- lapply(
  as.character(phoible.families),
  function(f) unique(phoible.meta[meta.family == f, 'InventoryID'])
)

print("PHOIBLE")
for (i in 1:1000) {
  if (i %% 50 == 0) {
    print(i)
  }
  ## Create keys for stratified resampling
  ## Use inventory IDs instead of language names.
  ## Index by a sampled position rather than calling sample(ids, 1):
  ## sample() treats a single number n >= 1 as the range 1:n, so a family
  ## with exactly one numeric inventory ID would get a random ID from 1..ID.
  keys <- unlist(lapply(
    inventories.by.family,
    function(ids) ids[sample.int(length(ids), 1)]
  ))

  ## Phoneme counts in this replica; phonemes absent from it count as 0.
  freqs <- table(phoible.phoneme[phoible.phons$InventoryID %in% keys])
  counts <- as.vector(freqs)[match(phonemes, names(freqs))]
  counts[is.na(counts)] <- 0

  idx <- length(phonemes) * 1000 + (i - 1) * length(phonemes) +
    seq_along(phonemes)
  phonemes.col[idx] <- phonemes
  frequencies.col[idx] <- counts / length(phoible.families)
}
## Long-format results: one row per (replica, phoneme) slot in each dataset.
boot.df <- data.frame(phoneme = phonemes.col,
                      frequency = frequencies.col,
                      dataset = datasets.col)

## Boxplots of the bootstrap frequencies for a hand-picked set of phonemes,
## BDPROTO and PHOIBLE side by side.
basic.phons <- c('p', 't', 'k', 'f', 'b', 'd', 'ɡ', 'v')
library(ggplot2)
basic.df <- droplevels(boot.df[boot.df$phoneme %in% basic.phons, ])
ggplot(basic.df, aes(x = phoneme, y = frequency, fill = dataset)) +
  geom_boxplot() +
  theme_bw() +
  theme(axis.text.x = element_text(size = 13))
## Per-phoneme difference between the median bootstrap frequency in PHOIBLE
## and in BDPROTO (positive = more frequent in PHOIBLE). The result is a
## numeric vector named by phoneme.
median.difs <- vapply(phonemes, function(p) {
  is.p <- boot.df$phoneme == p
  freqs.proto <- boot.df$frequency[is.p & boot.df$dataset == 'bdproto']
  freqs.phoible <- boot.df$frequency[is.p & boot.df$dataset == 'phoible']
  median(freqs.phoible) - median(freqs.proto)
}, numeric(1))

## The five phonemes with the largest PHOIBLE-minus-BDPROTO difference.
top5 <- head(names(sort(median.difs, decreasing = TRUE)),
             n = 5)

## Boxplots of their bootstrap frequencies in both datasets.
ggplot(
  aes(x = phoneme, y = frequency, fill = dataset),
  data = droplevels(boot.df[boot.df$phoneme %in% top5, ])
) + geom_boxplot() + theme_bw() +
  theme(axis.text.x = element_text(size = 13))
## Read the comma-separated list of borrowed segments (one entry per
## occurrence, so repeated values act as counts).
current.borrowings <- scan('current_borrowings.dat', what = character(), sep = ',')

## Inspect the 20 most frequent entries as they appear in the file,
## i.e. before the transcription fixes below.
head(sort(table(current.borrowings), decreasing = TRUE),
     n = 20)

## Rewrite three transcriptions so they can match entries of `phonemes`.
current.borrowings[current.borrowings == 'tʃ'] <- 't̠ʃ'
current.borrowings[current.borrowings == 'dʒ'] <- 'd̠ʒ'
current.borrowings[current.borrowings == 'g'] <- 'ɡ'

## Segments that occur both in the borrowings list and in `phonemes`.
borrowed.and.reconstructed <- intersect(phonemes,
                                        current.borrowings)

## x: how often each such segment occurs in the borrowings list;
## y: its PHOIBLE-minus-BDPROTO median frequency difference.
x <- as.numeric(table(current.borrowings)[borrowed.and.reconstructed])
y <- median.difs[borrowed.and.reconstructed]
scatter.smooth(y ~ x)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment