Last active
January 27, 2018 20:32
-
-
Save trinker/c5fe10b14825be9676c7b7308dde845c to your computer and use it in GitHub Desktop.
Reading transcripts with textreadr & textshape packages
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## From: https://stackoverflow.com/q/48479535/1000343
## First part is just to make an external text file like the OP provided.
## Layout of the sample: a "Month <name>" header line, then alternating
## upper-case speaker lines and dialogue lines; two empty lines separate the
## two months. Trailing spaces are part of the sample data and are kept.
x <- c(
  "Month march ", "MARK ", "good in mathematic ", "JOE",
  "he need help in language ", "SUZANA ", "he is good in mathematic",
  "MARY ", "she has ti work hard this month ",
  "", "",
  "Month April ", "MARK ", "good in language ", "JOE",
  "he need help in mathematics ", "SUZANA ", "he is good in history ",
  "MARY ", "she need help "
)
## writeLines() ends the last line with a newline, so reading the file back
## with readLines() does not warn about an incomplete final line.
writeLines(x, "convo.txt")
## Install dependencies
## Base R has no `install.github()`; GitHub installs go through the remotes
## package, which is fetched from CRAN first when it is not yet available.
if (!requireNamespace("remotes", quietly = TRUE)) {
  install.packages("remotes")
}
## file.path() builds the "user/repo" strings: trinker/textclean, etc.
remotes::install_github(
  file.path("trinker", c("textclean", "textshape", "lexicon"))
)
library(textshape)
library(textclean)
## Read in the data
txt <- readLines("convo.txt")

## reshaping and cleaning via textshape & textclean
## Month header lines ("Month <word>", any letter case, optional trailing
## whitespace). Shared by the labelling and the splitting step below.
month_regex <- "(?i)^month\\s+\\w+\\s*$"

## Month labels: the header lines, trimmed, lower-cased and stripped of the
## leading "month " (e.g. "Month march " becomes "march").
month_labels <- txt %>%
  keep_element_regex(month_regex) %>%
  trimws() %>%
  tolower() %>%
  mgsub("month ", "")

## Split the lines at each month header and name the pieces by month.
## NOTE(review): `include = FALSE` presumably leaves the header line itself
## out of each piece -- confirm against the textshape documentation.
month_chunks <- txt %>%
  split_match_regex(month_regex, include = FALSE) %>%
  setNames(month_labels)

## Within each month, a line starting with three or more capital letters is
## used as the split point for building a per-month transcript.
month_transcripts <- lapply(month_chunks, function(chunk) {
  split_match_regex_to_transcript(trimws(chunk), "^[A-Z]{3,}")
})

## Combine the per-month results into one table keyed by a `Month` column.
out <- tidy_list(month_transcripts, "Month")
out
## Month Person Dialogue | |
## 1: march MARK good in mathematic | |
## 2: march JOE he need help in language | |
## 3: march SUZANA he is good in mathematic | |
## 4: march MARY she has ti work hard this month | |
## 5: april MARK good in language | |
## 6: april JOE he need help in mathematics | |
## 7: april SUZANA he is good in history | |
## 8: april MARY she need help | |
## Now one can filter and query to extract specific elements
library(dplyr)
## Keep only the rows whose Person is MARK. The namespace is spelled out so
## the call cannot resolve to stats::filter().
dplyr::filter(out, Person == "MARK")
## Month Person Dialogue | |
## 1 march MARK good in mathematic | |
## 2 april MARK good in language |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment