macleginn · January 14, 2019 15:36
diff --git a/PDPROTO_v_PHOIBLE.r b/PDPROTO_v_PHOIBLE.r
 ## Load the BDPROTO table and keep only the rows in which both the
 ## family root and the language name are present and non-empty.
 proto.data <- read.csv('bdproto.csv', sep = ',')
 has.family <- !is.na(proto.data$LanguageFamilyRoot) &
     proto.data$LanguageFamilyRoot != ''
 has.language <- !is.na(proto.data$LanguageName) &
     proto.data$LanguageName != ''
 p.d <- proto.data[has.family & has.language, ]

 ## People mostly reconstruct weird stuff:
 ## > quantile(table(p.d$Phoneme))
 ##   0%  25%  50%  75% 100% 
 ##    0    1    1    4  205 
 ## > quantile(table(p.d$Phoneme)[table(p.d$Phoneme) >= 10])
 ##   0%  25%  50%  75% 100% 
 ##   10   13   28   73  205 

 ## Families to stratify over, and the phonemes that occur at least
 ## five times in the cleaned data (the table is built once and reused).
 phoneme.counts <- table(p.d$Phoneme)
 families <- unique(p.d$LanguageFamilyRoot)
 phonemes <- names(phoneme.counts[phoneme.counts >= 5])

 ## Bootstrap resampling
 ## One random language per family
 ##
 ## Results go into three parallel vectors of length
 ## length(phonemes) * n.boot * 2: the first half is filled here with
 ## the BDPROTO replicas, the second half is left for PHOIBLE.  Replica
 ## i of a dataset owns one consecutive slot per entry of `phonemes`.
 n.boot <- 1000
 n.phon <- length(phonemes)
 phonemes.col <- rep('', n.phon * n.boot * 2)
 frequencies.col <- rep(0, n.phon * n.boot * 2)
 datasets.col <- c(
     rep('bdproto', n.phon * n.boot),
     rep('phoible', n.phon * n.boot)
 )
 ## Family + language key of every row; loop-invariant, so built once.
 row.keys <- paste0(p.d$LanguageFamilyRoot, p.d$LanguageName)
 print("BDPROTO")
 for (i in seq_len(n.boot)) {
     if (i %% 50 == 0)
         print(i)
     ## Create keys for stratified resampling: one language per family.
     keys <- vapply(as.character(families), function(f) {
         langs <- unique(as.character(
             p.d[p.d$LanguageFamilyRoot == f, 'LanguageName']))
         ## Draw an index so the pick is always one of `langs`.
         paste0(f, langs[sample.int(length(langs), 1)])
     }, character(1), USE.NAMES = FALSE)
     sampled <- p.d[row.keys %in% keys, 'Phoneme']
     ## as.character() rather than droplevels(): on R >= 4.0 read.csv()
     ## yields character columns, for which droplevels() has no method.
     ## Either way only phonemes present in this replica are tabulated.
     freqs <- table(as.character(sampled))
     ## Slots of replica i; seq_len() keeps this a no-op when there are
     ## no phonemes at all.
     idx <- (i - 1) * n.phon + seq_len(n.phon)
     counts <- as.vector(freqs)[match(phonemes, names(freqs))]
     ## Phonemes absent from this replica have frequency zero.
     counts[is.na(counts)] <- 0
     phonemes.col[idx] <- phonemes
     ## Share of the sampled languages (one per family) with the phoneme.
     frequencies.col[idx] <- counts / length(families)
 }
 ## Do the same for PHOIBLE
 ## Fills the second half of phonemes.col / frequencies.col, i.e. the
 ## slots after the first length(phonemes) * n.boot entries.
 n.boot <- 1000  ## must match the replica count used to size the vectors
 n.phon <- length(phonemes)
 phoible.dir <- 'XXX'
 phoible.meta <- read.csv(paste0(phoible.dir, 'phoible-aggregated.tsv'),
                          sep = '\t')
 phoible.phons <- read.csv(paste0(phoible.dir, 'phoible-phonemes.tsv'),
                           sep = '\t')
 ## Phonemes are the same; families are different
 phoible.families <- unique(phoible.meta$LanguageFamilyRoot)
 ## Candidate inventory IDs of every family; loop-invariant, so looked
 ## up once instead of once per replica.
 inventories.by.family <- lapply(as.character(phoible.families), function(f) {
     unique(phoible.meta[phoible.meta$LanguageFamilyRoot == f, 'InventoryID'])
 })
 print("PHOIBLE")
 for (i in seq_len(n.boot)) {
     if (i %% 50 == 0)
         print(i)
     ## Create keys for stratified resampling
     ## Use inventory IDs instead of language names
     ##
     ## Draw an index with sample.int() instead of calling sample() on
     ## the IDs: sample(x, 1) with a single number x >= 1 samples from
     ## 1:x, which would pick an arbitrary inventory for every family
     ## that has exactly one inventory.
     keys <- unlist(lapply(inventories.by.family, function(ids) {
         ids[sample.int(length(ids), 1)]
     }))
     sampled <- phoible.phons[phoible.phons$InventoryID %in% keys, 'Phoneme']
     ## as.character() rather than droplevels(): on R >= 4.0 read.csv()
     ## yields character columns, for which droplevels() has no method.
     freqs <- table(as.character(sampled))
     ## Slots of replica i in the PHOIBLE half of the result vectors.
     idx <- n.phon * n.boot + (i - 1) * n.phon + seq_len(n.phon)
     counts <- as.vector(freqs)[match(phonemes, names(freqs))]
     ## Phonemes absent from this replica have frequency zero.
     counts[is.na(counts)] <- 0
     phonemes.col[idx] <- phonemes
     ## Share of the sampled inventories (one per family) with the phoneme.
     frequencies.col[idx] <- counts / length(phoible.families)
 }
 ## Long-format bootstrap results: the three parallel result vectors
 ## become the columns phoneme, frequency and dataset.
 boot.df <- data.frame(phoneme = phonemes.col,
                       frequency = frequencies.col,
                       dataset = datasets.col)


 ## Boxplots of the bootstrapped frequencies for a fixed set of eight
 ## consonants, with one box per dataset for every phoneme.
 library(ggplot2)

 basic.phons <- c('p', 't', 'k', 'f', 'b', 'd', 'ɡ', 'v')

 ggplot(data = droplevels(boot.df[boot.df$phoneme %in% basic.phons, ]),
        mapping = aes(x = phoneme, y = frequency, fill = dataset)) +
     geom_boxplot() +
     theme_bw() +
     theme(axis.text.x = element_text(size = 13))

 ## For every phoneme: median bootstrapped frequency in PHOIBLE minus
 ## the median in BDPROTO, as a numeric vector named by phoneme.
 ## The dataset masks do not depend on the phoneme, so build them once.
 is.proto <- boot.df$dataset == 'bdproto'
 is.phoible <- boot.df$dataset == 'phoible'
 median.difs <- vapply(phonemes, function(p) {
     is.p <- boot.df$phoneme == p
     median(boot.df$frequency[is.p & is.phoible]) -
         median(boot.df$frequency[is.p & is.proto])
 }, numeric(1))
 ## The five phonemes with the largest PHOIBLE-minus-BDPROTO difference.
 ## TRUE rather than T: T is an ordinary variable and can be reassigned.
 top5 <- head(names(sort(median.difs, decreasing = TRUE)),
              n = 5)

 ## Same boxplot layout, restricted to the phonemes listed in `top5`.
 ggplot(data = droplevels(boot.df[boot.df$phoneme %in% top5, ]),
        mapping = aes(x = phoneme, y = frequency, fill = dataset)) +
     geom_boxplot() +
     theme_bw() +
     theme(axis.text.x = element_text(size = 13))

 ## Read the comma-separated list of borrowed segments and show the 20
 ## most frequent ones.
 current.borrowings <- scan('current_borrowings.dat', what = character(), sep = ',')
 ## TRUE rather than T: T is an ordinary variable and can be reassigned.
 head(sort(table(current.borrowings), decreasing = TRUE),
      n = 20)

 ## Rewrite three transcriptions before matching against `phonemes`
 ## (presumably to follow the notation used there -- TODO confirm).
 current.borrowings[current.borrowings == 'tʃ'] <- 't̠ʃ'
 current.borrowings[current.borrowings == 'dʒ'] <- 'd̠ʒ'
 current.borrowings[current.borrowings == 'g'] <- 'ɡ'

 borrowed.and.reconstructed <- intersect(phonemes,
                                         current.borrowings)

 ## x: how often each shared segment occurs in the borrowings list;
 ## y: its PHOIBLE-minus-BDPROTO median frequency difference.
 borrowing.counts <- table(current.borrowings)
 x <- as.numeric(borrowing.counts[borrowed.and.reconstructed])
 y <- median.difs[borrowed.and.reconstructed]
 scatter.smooth(y ~ x)
	## Load the BDPROTO table and keep only the rows in which both the
	## family root and the language name are present and non-empty.
	proto.data <- read.csv('bdproto.csv', sep = ',')
	has.family <- !is.na(proto.data$LanguageFamilyRoot) &
	    proto.data$LanguageFamilyRoot != ''
	has.language <- !is.na(proto.data$LanguageName) &
	    proto.data$LanguageName != ''
	p.d <- proto.data[has.family & has.language, ]

	## People mostly reconstruct weird stuff:
	## > quantile(table(p.d$Phoneme))
	##   0%  25%  50%  75% 100%
	##    0    1    1    4  205
	## > quantile(table(p.d$Phoneme)[table(p.d$Phoneme) >= 10])
	##   0%  25%  50%  75% 100%
	##   10   13   28   73  205

	## Families to stratify over, and the phonemes that occur at least
	## five times in the cleaned data (the table is built once and reused).
	phoneme.counts <- table(p.d$Phoneme)
	families <- unique(p.d$LanguageFamilyRoot)
	phonemes <- names(phoneme.counts[phoneme.counts >= 5])

	## Bootstrap resampling
	## One random language per family
	##
	## Results go into three parallel vectors of length
	## length(phonemes) * n.boot * 2: the first half is filled here with
	## the BDPROTO replicas, the second half is left for PHOIBLE.  Replica
	## i of a dataset owns one consecutive slot per entry of `phonemes`.
	n.boot <- 1000
	n.phon <- length(phonemes)
	## The multiplication signs were missing here, which does not parse.
	phonemes.col <- rep('', n.phon * n.boot * 2)
	frequencies.col <- rep(0, n.phon * n.boot * 2)
	datasets.col <- c(
	    rep('bdproto', n.phon * n.boot),
	    rep('phoible', n.phon * n.boot)
	)
	## Family + language key of every row; loop-invariant, so built once.
	row.keys <- paste0(p.d$LanguageFamilyRoot, p.d$LanguageName)
	print("BDPROTO")
	for (i in seq_len(n.boot)) {
	    if (i %% 50 == 0)
	        print(i)
	    ## Create keys for stratified resampling: one language per family.
	    keys <- vapply(as.character(families), function(f) {
	        langs <- unique(as.character(
	            p.d[p.d$LanguageFamilyRoot == f, 'LanguageName']))
	        ## Draw an index so the pick is always one of `langs`.
	        paste0(f, langs[sample.int(length(langs), 1)])
	    }, character(1), USE.NAMES = FALSE)
	    sampled <- p.d[row.keys %in% keys, 'Phoneme']
	    ## as.character() rather than droplevels(): on R >= 4.0 read.csv()
	    ## yields character columns, for which droplevels() has no method.
	    ## Either way only phonemes present in this replica are tabulated.
	    freqs <- table(as.character(sampled))
	    ## Slots of replica i; seq_len() keeps this a no-op when there are
	    ## no phonemes at all.
	    idx <- (i - 1) * n.phon + seq_len(n.phon)
	    counts <- as.vector(freqs)[match(phonemes, names(freqs))]
	    ## Phonemes absent from this replica have frequency zero.
	    counts[is.na(counts)] <- 0
	    phonemes.col[idx] <- phonemes
	    ## Share of the sampled languages (one per family) with the phoneme.
	    frequencies.col[idx] <- counts / length(families)
	}
	## Do the same for PHOIBLE
	## Fills the second half of phonemes.col / frequencies.col, i.e. the
	## slots after the first length(phonemes) * n.boot entries.
	n.boot <- 1000  ## must match the replica count used to size the vectors
	n.phon <- length(phonemes)
	phoible.dir <- 'XXX'
	phoible.meta <- read.csv(paste0(phoible.dir, 'phoible-aggregated.tsv'),
	                         sep = '\t')
	phoible.phons <- read.csv(paste0(phoible.dir, 'phoible-phonemes.tsv'),
	                          sep = '\t')
	## Phonemes are the same; families are different
	phoible.families <- unique(phoible.meta$LanguageFamilyRoot)
	## Candidate inventory IDs of every family; loop-invariant, so looked
	## up once instead of once per replica.
	inventories.by.family <- lapply(as.character(phoible.families), function(f) {
	    unique(phoible.meta[phoible.meta$LanguageFamilyRoot == f, 'InventoryID'])
	})
	print("PHOIBLE")
	for (i in seq_len(n.boot)) {
	    if (i %% 50 == 0)
	        print(i)
	    ## Create keys for stratified resampling
	    ## Use inventory IDs instead of language names
	    ##
	    ## Draw an index with sample.int() instead of calling sample() on
	    ## the IDs: sample(x, 1) with a single number x >= 1 samples from
	    ## 1:x, which would pick an arbitrary inventory for every family
	    ## that has exactly one inventory.
	    keys <- unlist(lapply(inventories.by.family, function(ids) {
	        ids[sample.int(length(ids), 1)]
	    }))
	    sampled <- phoible.phons[phoible.phons$InventoryID %in% keys, 'Phoneme']
	    ## as.character() rather than droplevels(): on R >= 4.0 read.csv()
	    ## yields character columns, for which droplevels() has no method.
	    freqs <- table(as.character(sampled))
	    ## Slots of replica i in the PHOIBLE half of the result vectors
	    ## (the multiplication signs were missing here, which does not parse).
	    idx <- n.phon * n.boot + (i - 1) * n.phon + seq_len(n.phon)
	    counts <- as.vector(freqs)[match(phonemes, names(freqs))]
	    ## Phonemes absent from this replica have frequency zero.
	    counts[is.na(counts)] <- 0
	    phonemes.col[idx] <- phonemes
	    ## Share of the sampled inventories (one per family) with the phoneme.
	    frequencies.col[idx] <- counts / length(phoible.families)
	}
	## Long-format bootstrap results: the three parallel result vectors
	## become the columns phoneme, frequency and dataset.
	boot.df <- data.frame(phoneme = phonemes.col,
	                      frequency = frequencies.col,
	                      dataset = datasets.col)


	## Boxplots of the bootstrapped frequencies for a fixed set of eight
	## consonants, with one box per dataset for every phoneme.
	library(ggplot2)

	basic.phons <- c('p', 't', 'k', 'f', 'b', 'd', 'ɡ', 'v')

	ggplot(data = droplevels(boot.df[boot.df$phoneme %in% basic.phons, ]),
	       mapping = aes(x = phoneme, y = frequency, fill = dataset)) +
	    geom_boxplot() +
	    theme_bw() +
	    theme(axis.text.x = element_text(size = 13))

	## For every phoneme: median bootstrapped frequency in PHOIBLE minus
	## the median in BDPROTO, as a numeric vector named by phoneme.
	## The dataset masks do not depend on the phoneme, so build them once.
	is.proto <- boot.df$dataset == 'bdproto'
	is.phoible <- boot.df$dataset == 'phoible'
	median.difs <- vapply(phonemes, function(p) {
	    is.p <- boot.df$phoneme == p
	    median(boot.df$frequency[is.p & is.phoible]) -
	        median(boot.df$frequency[is.p & is.proto])
	}, numeric(1))
	## The five phonemes with the largest PHOIBLE-minus-BDPROTO difference.
	## TRUE rather than T: T is an ordinary variable and can be reassigned.
	top5 <- head(names(sort(median.difs, decreasing = TRUE)),
	             n = 5)

	## Same boxplot layout, restricted to the phonemes listed in `top5`.
	ggplot(data = droplevels(boot.df[boot.df$phoneme %in% top5, ]),
	       mapping = aes(x = phoneme, y = frequency, fill = dataset)) +
	    geom_boxplot() +
	    theme_bw() +
	    theme(axis.text.x = element_text(size = 13))

	## Read the comma-separated list of borrowed segments and show the 20
	## most frequent ones.
	current.borrowings <- scan('current_borrowings.dat', what = character(), sep = ',')
	## TRUE rather than T: T is an ordinary variable and can be reassigned.
	head(sort(table(current.borrowings), decreasing = TRUE),
	     n = 20)

	## Rewrite three transcriptions before matching against `phonemes`
	## (presumably to follow the notation used there -- TODO confirm).
	current.borrowings[current.borrowings == 'tʃ'] <- 't̠ʃ'
	current.borrowings[current.borrowings == 'dʒ'] <- 'd̠ʒ'
	current.borrowings[current.borrowings == 'g'] <- 'ɡ'

	borrowed.and.reconstructed <- intersect(phonemes,
	                                        current.borrowings)

	## x: how often each shared segment occurs in the borrowings list;
	## y: its PHOIBLE-minus-BDPROTO median frequency difference.
	borrowing.counts <- table(current.borrowings)
	x <- as.numeric(borrowing.counts[borrowed.and.reconstructed])
	y <- median.difs[borrowed.and.reconstructed]
	scatter.smooth(y ~ x)
No results found