Skip to content

Instantly share code, notes, and snippets.

@trinker
Created March 22, 2019 19:14
Show Gist options
  • Select an option

  • Save trinker/ec317db814caeccf06fa6e3206d5a1b2 to your computer and use it in GitHub Desktop.

Select an option

Save trinker/ec317db814caeccf06fa6e3206d5a1b2 to your computer and use it in GitHub Desktop.
formality_with_udpipe
##==============================================================================
## Formality
##==============================================================================
## 1. tag parts of speech
## 2. convert to generic POS
## 3. Compute formality off POS
## Download the English udpipe model, then load it from the downloaded file.
udmodel <- udpipe::udpipe_download_model(language = "english")
udmodel <- udpipe::udpipe_load_model(file = udmodel$file_model)

## Annotate the free-text column, wrapped in a tic()/toc() pair.
## NOTE(review): `dat` is not created in this file; it has to exist already
## and carry the OpenText, InstitutionId and ResponseId columns used below.
cl::tic()
text.var <- "OpenText"
annotated <- udpipe::udpipe_annotate(
  udmodel,
  x = dat[[text.var]],
  parser = "none" ## cuts run time to < 1/3
)
cl::toc()

## Swap the raw text for the per-document formality columns and put a hashed
## Id first.
## NOTE: as_formality() is defined further down in this file, so that
## definition must have been evaluated before this pipeline runs.
formality <- dat %>%
  dplyr::select(-OpenText) %>%
  dplyr::bind_cols(as_formality(annotated)) %>%
  dplyr::mutate(
    Id = as.character(
      openssl::sha2(paste0("formality", InstitutionId, ResponseId))
    )
  ) %>%
  dplyr::select(Id, everything())
## Functions to map xpos tags to generic POS types and compute formality from them
library(data.table)
## S3 generic for computing formality measures.
##
## Arguments:
##   .data  Object to dispatch on; the method is chosen from its class.
##   ...    Further arguments, made available to the selected method.
##
## Returns whatever the selected method returns.
as_formality <- function(.data, ...) UseMethod("as_formality")
## as_formality() method for udpipe annotations (class "udpipe_connlu").
##
## Converts the annotation to a data frame, looks every token's xpos tag up in
## `formal_map` (a file-level lookup table from xpos to "formal" /
## "contextual"; it must exist when this function is called) and summarises
## the result per document.
##
## Arguments:
##   .data  A udpipe annotation whose data-frame form contains the columns
##          doc_id, token and xpos.
##   ...    Unused.
##
## Returns a data.table with one row per document and the columns TokenCount,
## FormalCount, ContextualCount, FormalRate and ContextualRate. Tokens whose
## tag has no type in `formal_map` are not counted; a document without any
## typed token gets NA in every column.
##
## Row order: when every doc_id is an integer with an optional "doc" prefix
## ("doc1", "doc2", ...), rows are sorted by that integer. Otherwise rows keep
## the order in which the documents first appear in the annotation, instead of
## being coerced to NA and returned in sorted-key order.
as_formality.udpipe_connlu <- function(.data, ...) {
  .data <- as.data.frame(.data, stringsAsFactors = FALSE)
  ## doc_id is needed for the grouping below, so validate it up front too.
  stopifnot(all(c("doc_id", "token", "xpos") %in% colnames(.data)))

  ## Every document in its output order; used at the end to re-add documents
  ## that have no typed token at all.
  doc_ids <- unique(as.character(.data[["doc_id"]]))
  doc_num <- suppressWarnings(as.integer(sub("^doc", "", doc_ids)))
  if (!anyNA(doc_num)) {
    doc_ids <- doc_ids[order(doc_num)]
  }
  docs <- data.table::data.table(doc_id = doc_ids)

  tokens <- data.table::data.table(
    doc_id = as.character(.data[["doc_id"]]),
    xpos = .data[["xpos"]]
  )

  ## Tags missing from formal_map come back with type NA and are dropped.
  typed <- formal_map[tokens, on = "xpos"][!is.na(type)]

  counts <- typed[
    ,
    list(
      TokenCount = .N,
      FormalCount = sum(type == "formal")
    ),
    by = "doc_id"
  ]
  counts[, ContextualCount := TokenCount - FormalCount]
  counts[, FormalRate := FormalCount / TokenCount]
  counts[, ContextualRate := ContextualCount / TokenCount]

  ## Joining onto `docs` yields one row per document, in `docs` order.
  result <- counts[docs, on = "doc_id"]
  result[, doc_id := NULL][]
}
## Lookup table for xpos tags.
##   xpos   tag as it appears in the annotation
##   basic  generic part of speech ("." = punctuation, "X" = other)
##   type   "formal", "contextual", or NA when the tag is not scored
.basic <- tibble::tribble(
  ~xpos,     ~basic,         ~type,
  "!",       ".",            NA,
  "#",       ".",            NA,
  "$",       ".",            NA,
  "''",      ".",            NA,
  "(",       ".",            NA,
  ")",       ".",            NA,
  ",",       ".",            NA,
  "-LRB-",   ".",            NA,
  "-RRB-",   ".",            NA,
  ".",       ".",            NA,
  ":",       ".",            NA,
  "?",       ".",            NA,
  "CC",      "conjunction",  NA,
  "CD",      "adjective",    "formal",
  "CD|RB",   "X",            NA,
  "DT",      "adjective",    "formal",
  "EX",      "noun",         "formal",
  "FW",      "X",            NA,
  "IN",      "preposition",  "formal",
  "IN|RP",   "preposition",  "formal",
  "JJ",      "adjective",    "formal",
  "JJR",     "adjective",    "formal",
  "JJRJR",   "adjective",    "formal",
  "JJS",     "adjective",    "formal",
  "JJ|RB",   "adjective",    "formal",
  "JJ|VBG",  "adjective",    "formal",
  "LS",      "X",            NA,
  "MD",      "verb",         "contextual",
  "NN",      "noun",         "formal",
  "NNP",     "noun",         "formal",
  "NNPS",    "noun",         "formal",
  "NNS",     "noun",         "formal",
  "NN|NNS",  "noun",         "formal",
  "NN|SYM",  "noun",         "formal",
  "NN|VBG",  "noun",         "formal",
  "NP",      "noun",         "formal",
  "PDT",     "adjective",    "formal",
  "POS",     "X",            NA,
  "PRP",     "pronoun",      "contextual",
  "PRP$",    "pronoun",      "contextual",
  "PRP|VBP", "pronoun",      "contextual",
  "PRT",     "preposition",  "formal",
  "RB",      "adverb",       "contextual",
  "RBR",     "adverb",       "contextual",
  "RBS",     "adverb",       "contextual",
  "RB|RP",   "adverb",       "contextual",
  "RB|VBG",  "adverb",       "contextual",
  "RN",      "X",            NA,
  "RP",      "preposition",  "formal",
  "SYM",     "X",            NA,
  "TO",      "preposition",  "formal",
  "UH",      "interjection", "contextual",
  "VB",      "verb",         "contextual",
  "VBD",     "verb",         "contextual",
  "VBD|VBN", "verb",         "contextual",
  "VBG",     "verb",         "contextual",
  "VBG|NN",  "verb",         "contextual",
  "VBN",     "verb",         "contextual",
  "VBP",     "verb",         "contextual",
  "VBP|TO",  "verb",         "contextual",
  "VBZ",     "verb",         "contextual",
  "VP",      "verb",         "contextual",
  "WDT",     "pronoun",      "contextual",
  "WH",      "X",            NA,
  "WP",      "pronoun",      "contextual",
  "WP$",     "pronoun",      "contextual",
  "WRB",     "adverb",       "contextual",
  "``",      ".",            NA,
  "article", "article",      "formal"
)

## xpos -> type lookup used by as_formality.udpipe_connlu(): the `basic`
## column is dropped, only tags that have a type are kept, and the table is
## keyed on xpos.
formal_map <- data.table::data.table(data.table::copy(.basic))
formal_map[, basic := NULL]
formal_map <- formal_map[!is.na(type), ]
data.table::setkey(formal_map, "xpos")
##==============================================================================
## Convert snake_case strings to upper camel case ("token_count" ->
## "TokenCount").
##
## Arguments:
##   x    Character vector of names to convert.
##   ...  Ignored.
##
## Returns a character vector the same length as `x`. Only an underscore that
## is directly followed by a lower-case ASCII letter is removed; any other
## underscore is left in place.
snake2camel <- function(x, ...) {
  ## Drop "_" before a lower-case letter and upper-case that letter.
  humps <- gsub("(_)([a-z])", "\\U\\2", x, perl = TRUE)
  ## Upper-case a leading lower-case letter.
  gsub("(^[a-z])", "\\U\\1", humps, perl = TRUE)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment