trinker · January 27, 2018 20:32
diff --git a/transcript_reading.R b/transcript_reading.R
 ## From: https://stackoverflow.com/q/48479535/1000343

 ## First part is just to make an external text file like the OP provided.

 ## Lines of the sample file, one element per line of text.
 x <- c(
     ## first month
     "Month march ",
     "MARK ",
     "good in mathematic ",
     "JOE",
     "he need  help in language ",
     "SUZANA ",
     "he is  good  in mathematic",
     "MARY ",
     "she has  ti work hard  this month ",
     ## two empty lines between the months
     "",
     "",
     ## second month
     "Month April  ",
     "MARK ",
     "good in language ",
     "JOE",
     "he need  help in mathematics ",
     "SUZANA ",
     "he is  good  in history ",
     "MARY ",
     "she need  help "
 )

 ## Write the elements to 'convo.txt', separated by newlines.
 cat(x, sep = '\n', file = 'convo.txt')

 ## Install dependencies (the 'trinker' GitHub repositories textclean,
 ## textshape and lexicon), then attach the packages the script uses.
 ## NOTE(review): `install.github` is not defined in this script and is not a
 ## base R function; presumably an installer such as
 ## devtools::install_github() was intended -- confirm before running.
 install.github(file.path('trinker', c('textclean', 'textshape', 'lexicon')))
 library(textshape)
 library(textclean)
 ## dplyr is attached here, ahead of the first pipeline, so that `%>%` is
 ## available as soon as the reshaping code runs.
 library(dplyr)

 ## Read the file back in, one element per line
 txt <- readLines('convo.txt')

 ## Reshaping and cleaning via textshape & textclean.  The steps run inside
 ## local() so that only `out` is left behind in the workspace.
 out <- local({

     ## Case-insensitive pattern for a header line: "month", whitespace,
     ## one word, optional trailing whitespace.
     month_rx <- '(?i)^month\\s+\\w+\\s*$'

     ## Header lines, trimmed and lower-cased, with the leading 'month '
     ## removed (e.g. "Month April  " becomes "april").
     month_names <- txt %>%
         keep_element_regex(month_rx) %>%
         trimws() %>%
         tolower() %>%
         mgsub('month ', '')

     ## Split the lines at the header lines (include = FALSE) and name each
     ## piece after its month.
     by_month <- setNames(
         split_match_regex(txt, month_rx, include = FALSE),
         month_names
     )

     ## Within each month, trim the lines and split on lines that start with
     ## three or more capital letters (the speaker names in the sample data).
     transcripts <- lapply(by_month, function(chunk) {
         split_match_regex_to_transcript(trimws(chunk), "^[A-Z]{3,}")
     })

     ## Combine the named list into a single table; 'Month' is passed as the
     ## second argument of tidy_list().
     tidy_list(transcripts, 'Month')
 })

 out
               
 ##    Month Person                        Dialogue
 ## 1: march   MARK              good in mathematic
 ## 2: march    JOE        he need help in language
 ## 3: march SUZANA        he is good in mathematic
 ## 4: march   MARY she has ti work hard this month
 ## 5: april   MARK                good in language
 ## 6: april    JOE     he need help in mathematics
 ## 7: april SUZANA           he is good in history
 ## 8: april   MARY                   she need help
               
 ## Now one can filter and query to extract specific elements,
 ## e.g. keep only the rows whose Person is 'MARK'
 library(dplyr)
 filter(out, Person == 'MARK')

 ##   Month Person           Dialogue
 ## 1 march   MARK good in mathematic
 ## 2 april   MARK   good in language
	## From: https://stackoverflow.com/q/48479535/1000343

	## First part is just to make an external text file like the OP provided.

	## Lines of the sample file, one element per line of text.
	x <- c(
	    ## first month
	    "Month march ",
	    "MARK ",
	    "good in mathematic ",
	    "JOE",
	    "he need help in language ",
	    "SUZANA ",
	    "he is good in mathematic",
	    "MARY ",
	    "she has ti work hard this month ",
	    ## two empty lines between the months
	    "",
	    "",
	    ## second month
	    "Month April ",
	    "MARK ",
	    "good in language ",
	    "JOE",
	    "he need help in mathematics ",
	    "SUZANA ",
	    "he is good in history ",
	    "MARY ",
	    "she need help "
	)

	## Write the elements to 'convo.txt', separated by newlines.
	cat(x, sep = '\n', file = 'convo.txt')

	## Install dependencies (the 'trinker' GitHub repositories textclean,
	## textshape and lexicon), then attach the packages the script uses.
	## NOTE(review): `install.github` is not defined in this script and is not a
	## base R function; presumably an installer such as
	## devtools::install_github() was intended -- confirm before running.
	install.github(file.path('trinker', c('textclean', 'textshape', 'lexicon')))
	library(textshape)
	library(textclean)
	## dplyr is attached here, ahead of the first pipeline, so that `%>%` is
	## available as soon as the reshaping code runs.
	library(dplyr)

	## Read the file back in, one element per line
	txt <- readLines('convo.txt')

	## Reshaping and cleaning via textshape & textclean.  The steps run inside
	## local() so that only `out` is left behind in the workspace.
	out <- local({

	    ## Case-insensitive pattern for a header line: "month", whitespace,
	    ## one word, optional trailing whitespace.
	    month_rx <- '(?i)^month\\s+\\w+\\s*$'

	    ## Header lines, trimmed and lower-cased, with the leading 'month '
	    ## removed (e.g. "Month April " becomes "april").
	    month_names <- txt %>%
	        keep_element_regex(month_rx) %>%
	        trimws() %>%
	        tolower() %>%
	        mgsub('month ', '')

	    ## Split the lines at the header lines (include = FALSE) and name each
	    ## piece after its month.
	    by_month <- setNames(
	        split_match_regex(txt, month_rx, include = FALSE),
	        month_names
	    )

	    ## Within each month, trim the lines and split on lines that start with
	    ## three or more capital letters (the speaker names in the sample data).
	    transcripts <- lapply(by_month, function(chunk) {
	        split_match_regex_to_transcript(trimws(chunk), "^[A-Z]{3,}")
	    })

	    ## Combine the named list into a single table; 'Month' is passed as the
	    ## second argument of tidy_list().
	    tidy_list(transcripts, 'Month')
	})

	out

	##    Month Person                        Dialogue
	## 1: march   MARK              good in mathematic
	## 2: march    JOE        he need help in language
	## 3: march SUZANA        he is good in mathematic
	## 4: march   MARY she has ti work hard this month
	## 5: april   MARK                good in language
	## 6: april    JOE     he need help in mathematics
	## 7: april SUZANA           he is good in history
	## 8: april   MARY                   she need help

	## Now one can filter and query to extract specific elements,
	## e.g. keep only the rows whose Person is 'MARK'
	library(dplyr)
	filter(out, Person == 'MARK')

	##   Month Person           Dialogue
	## 1 march   MARK good in mathematic
	## 2 april   MARK   good in language