Created
July 7, 2021 09:02
-
-
Save gongcastro/34b7131ed98a463da9d8d4c88a77e03b to your computer and use it in GitHub Desktop.
Custom function to extract lexical frequencies from the CHILDES corpora.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract lexical frequencies from CHILDES.
# You may need to install the following packages:
# install.packages(c("dplyr", "stringr", "tidyr", "childesr"))
#' Extract lexical frequencies from the CHILDES corpora
#'
#' Looks up one or more word forms in the tokens produced by target children
#' and expresses their counts as raw frequency, frequency per million tokens,
#' and Zipf-style score (log10 of the per-million frequency, plus 3).
#'
#' Transcripts tagged with several languages (space-separated codes such as
#' "spa eng") contribute to every requested language they list. Languages
#' that were not requested are dropped from the result.
#'
#' @param token Character vector of word form(s) to look up,
#'   e.g. `c("table", "mesa")`.
#' @param languages Character vector of language codes in which to look up
#'   the word form(s).
#' @param ... Other arguments forwarded to
#'   `childesr::get_speaker_statistics()`; they shape the per-language token
#'   totals used as the denominator.
#' @return A data frame with columns `word`, `test_language`, `freq_counts`,
#'   `freq_per_million` and `freq_zipf`.
get_childes_frequency <- function(
  token, # word form(s) to look up, e.g. c("table", "mesa")
  languages = c("cat", "spa"), # languages in which to look up the word form
  ... # other arguments (see ?childesr::get_speaker_statistics)
){
  stopifnot(
    is.character(token), length(token) > 0,
    is.character(languages), length(languages) > 0
  )

  library(dplyr)
  library(stringr)
  library(tidyr)
  library(childesr)

  # regex matching any transcript whose language label mentions a requested code
  language_pattern <- paste(languages, collapse = "|")

  # total number of tokens in each requested language
  # NOTE(review): these totals cover every speaker selected through `...`,
  # whereas the word counts below are restricted to role = "target_child".
  # Pass role = "target_child" via `...` if both should refer to the same
  # population -- confirm the intended denominator.
  total_counts <- get_speaker_statistics(...) %>%
    filter(str_detect(language, language_pattern)) %>%
    group_by(language) %>%
    # na.rm so that one missing speaker total does not blank the whole group
    summarise(num_tokens = sum(num_tokens, na.rm = TRUE), .groups = "drop") %>%
    # split multi-language labels into one row per language code
    mutate(language = str_split(language, " ")) %>%
    unnest(cols = language) %>%
    # splitting can surface codes that were never requested; discard them
    filter(language %in% !!languages) %>%
    group_by(language) %>%
    summarise(n = sum(num_tokens, na.rm = TRUE), .groups = "drop")

  # absolute frequency (raw counts), case-insensitive on the gloss
  token_counts <- get_tokens(
    role = "target_child",
    token = token,
    language = languages
  ) %>%
    mutate(gloss = str_to_lower(gloss)) %>%
    filter(str_detect(language, language_pattern)) %>%
    count(gloss, language) %>%
    mutate(language = str_split(language, " ")) %>%
    unnest(cols = language) %>%
    filter(language %in% !!languages) %>%
    group_by(language, gloss) %>%
    summarise(freq_counts = sum(n), .groups = "drop") %>%
    filter(freq_counts > 0)

  # relative frequency (counts per million) and Zipf-style score
  token_counts %>%
    left_join(total_counts, by = "language") %>%
    mutate(
      freq_per_million = freq_counts / n * 1e6,
      freq_zipf = log10(freq_per_million) + 3
    ) %>%
    rename(word = gloss, test_language = language) %>%
    select(word, test_language, starts_with("freq_"))
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment