Skip to content

Instantly share code, notes, and snippets.

@erdavis1
Created May 21, 2020 19:32
Show Gist options
  • Save erdavis1/e4d24c8acf736ccb6fbf96272937a096 to your computer and use it in GitHub Desktop.
Save erdavis1/e4d24c8acf736ccb6fbf96272937a096 to your computer and use it in GitHub Desktop.
Process all books code outline
library(reticulate)
library(cleanNLP)
library(dplyr)
library(stringr)
library(tidyr)
library(textstem)
# Make the project folder the working directory so the relative paths
# used by the read.csv() calls below resolve against it.
setwd("C:/Users/Erin/Documents/DataViz/Adjectives/")
# Keep text columns as character vectors instead of factors.
options(stringsAsFactors = FALSE)
#-----spaCy: select the Python installation, then initialise the spaCy backend
use_python("C:/Users/Erin/Anaconda3")
cnlp_init_spacy()
#---get basic data
# `body` is loaded from bodyparts.csv; presumably the body-part vocabulary
# used by the extraction step -- TODO confirm against that code.
body <- read.csv("./Data/bodyparts.csv")
# `files` lists the books to process; the loop below reads its Link and ID columns.
files <- read.csv('PotentialBooks.csv')
# Maximum number of characters handed to the annotator in one chunk.
lim <- 900000
# Accumulator for the per-chunk results; starts empty and is grown with bind_rows().
final <- NULL
# Process every book listed in `files`.
#
# For each row, the text at files$Link[i] is read and collapsed into a single
# string, split into consecutive chunks of at most `lim` characters, and each
# chunk is annotated with cnlp_annotate(). The per-chunk results are tagged
# with the book's ID and appended to `final`.
#
# seq_len() is used for both loops so that an empty `files` table or an empty
# book yields zero iterations; `1:n` would count backwards (1, 0) when n is 0.
for (i in seq_len(nrow(files))) {
  # Join all lines of the book into one string separated by spaces.
  mastertext <- readLines(files$Link[i]) %>% paste(collapse = " ")

  #---spacy can only process 900k characters at once, or thereabouts
  loops <- ceiling(nchar(mastertext) / lim)

  for (j in seq_len(loops)) {
    # Characters (j - 1) * lim + 1 through j * lim; substr() truncates the
    # final chunk at the end of the string.
    # NOTE(review): chunk boundaries are purely character-based, so a word or
    # sentence can be split between two chunks.
    text <- substr(mastertext, (j - 1) * lim + 1, j * lim)

    #------annotate. this will take a bit
    obj <- cnlp_annotate(text)

    #----------extract body parts
<BODY PART EXTRACTION CODE HERE>

    #----------bind in results to final dataframe
    # simpleposs, hadadj and poss are expected to be created by the extraction
    # step above; they are stacked and exact duplicate rows are dropped.
    book_results <- bind_rows(simpleposs, hadadj) %>%
      bind_rows(poss) %>%
      unique()
    book_results$id <- files$ID[i]

    # Appended chunk by chunk so that `final` already holds every completed
    # chunk if a later book fails part-way through the run.
    final <- bind_rows(final, book_results)
  }
}
<SKEW CALCULATIONS PROCEED AS NORMAL HERE>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment