Last active
February 21, 2018 19:19
-
-
Save imjakedaniels/5cf6e81c332cf0ad5afe3cc9dc643b10 to your computer and use it in GitHub Desktop.
[R Studio]: Star Wars vs Star Trek movie scripts - Cleaning Unstructured Data, Tokenizing, Joins, GGPlot2, dplyr
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
title: "StarWars"
output: html_document
---
```{r}
# https://kkulma.github.io/2017-12-16-star-wars-vs-star-trek-word-battle/
```
```{r}
# packages
# Install only the packages that are not available yet. Calling
# install.packages() unconditionally re-downloads all of them every time the
# document is knit.
required_packages <- c(
  "rvest", "dplyr", "tm", "tidytext", "ggthemes", "ggplot2", "DT"
)
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
missing_packages <- required_packages[!is_installed]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# Attach in the original order so that name masking between packages is
# unchanged.
library(rvest)
library(dplyr)
library(tm)
library(tidytext)
library(ggthemes)
library(ggplot2)
library(DT)
```
```{r}
# Locate the two scripts online and pull their text out of the HTML.
swIV_url <- "http://www.imsdb.com/scripts/Star-Wars-A-New-Hope.html"
startrek_url <- "http://www.dailyscript.com/scripts/startrek01.html"

# Star Wars: extract the text of every <td> node and keep the 88th element.
# NOTE(review): presumably that cell holds the script body; the index depends
# on the page layout, so confirm it still points at the right cell.
star_wars <- html_text(html_nodes(read_html(swIV_url), "td"))[[88]]

# Star Trek: extract the text of every <pre> node.
star_trek <- html_text(html_nodes(read_html(startrek_url), "pre"))
```
```{r}
# clean
# Normalise raw script text ahead of tokenisation.
#
# x: character vector of raw text.
# Returns x with line breaks, tabs and punctuation replaced by spaces,
# letters lower-cased, numbers removed (tm::removeNumbers) and runs of
# whitespace collapsed (tm::stripWhitespace).
clean_text <- function(x) {
  # Control characters left over from the HTML-to-text conversion. One
  # explicit character class replaces three over-escaped patterns (a
  # backslash followed by a literal control character) that only worked
  # because the regex engine tolerates the redundant escape.
  x <- gsub(pattern = "[\r\n\t]", replacement = " ", x)
  # remove punctuation
  x <- gsub(pattern = "[[:punct:]]", replacement = " ", x)
  # Lower-case with base R, then use the two tm helpers to drop numbers and
  # squeeze the leftover whitespace.
  x <- tolower(x)
  x <- removeNumbers(x)
  stripWhitespace(x)
}
# Run both scraped scripts through the same cleaning function.
clean_star_wars <- clean_text(star_wars)
clean_star_trek <- clean_text(star_trek)
```
```{r}
# tokenization SW
# Word-frequency table for the Star Wars script: one row per distinct
# non-stop word, with its number of occurrences in sw_n, most frequent first.
#
# tibble() names the text column directly, which replaces the deprecated
# as_tibble()-on-a-vector + rename_() combination. The old helper column that
# copied the whole script onto every token row (and was discarded by count()
# anyway) is no longer created.
sw_tokens <- tibble(sw_text = clean_star_wars) %>%
  distinct() %>%
  # Split the text column into one word per row (uses the tokenizers package).
  unnest_tokens(word, sw_text) %>%
  # anti_join() is a filtering join: it keeps the rows of the left table that
  # have no match in the right one, instead of adding columns the way
  # inner_join()/full_join() do. Here it drops the stop words. See ?anti_join.
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE) %>%
  rename(sw_n = n)
sw_tokens
```
```{r}
# tokenization ST
# Word-frequency table for the Star Trek script: one row per distinct
# non-stop word, with its number of occurrences in st_n, most frequent first.
#
# tibble() names the text column directly, which replaces the deprecated
# as_tibble()-on-a-vector + rename_() combination. The old helper column that
# copied the full text onto every token row (and was discarded by count()
# anyway) is no longer created.
st_tokens <- tibble(st_text = clean_star_trek) %>%
  # The scrape returns one string per <pre> node; drop exact duplicates.
  distinct() %>%
  # Split the text column into one word per row.
  unnest_tokens(word, st_text) %>%
  # Filtering join: keep only the words that are not in the stop-word list.
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE) %>%
  rename(st_n = n)
st_tokens
```
```{r}
# Combining the similarities and removing 0s via inner join.
# The inner join keeps only words that occur in both frequency tables, so
# both counts are present for every row when the log ratio is computed.
final_tokens <- sw_tokens %>%
  inner_join(st_tokens, by = "word") %>%
  mutate(
    log_word_prop = round(log(sw_n / st_n), 3),
    # Note: ties (log_word_prop == 0) fall into the "star_trek" level.
    dominates_in = as.factor(
      ifelse(log_word_prop > 0, "star_wars", "star_trek")
    )
  )
# Re: log_word_prop: "To compare the frequencies I’ll use a variable called
# word_prop: logarithm of the proportion between Star Wars and Star Trek word
# frequency. This means that the more positive the value, the more drastic
# the difference in frequency it is in favour of Star Wars. On the other
# hand, the more negative the value, the more commonly it was used in Star
# Trek (in comparison to Star Wars)."
final_tokens
```
```{r}
# plotting
# Horizontal bar chart of a random sample of up to 30 words that occur
# equally often in both scripts.
set.seed(13)
final_tokens %>%
  # "Same frequency" is tested on the integer counts themselves rather than
  # by comparing a rounded floating-point log ratio with 0.
  filter(sw_n == st_n) %>%
  # Fix the row order before sampling so the seeded draw is reproducible.
  arrange(desc(sw_n)) %>%
  # sample_n() raises an error when asked for more rows than exist, so cap
  # the sample size at the number of tied words.
  sample_n(min(30, nrow(.))) %>%
  ggplot(
    aes(
      x = reorder(word, log_word_prop),
      y = log_word_prop,
      fill = dominates_in
    )
  ) +
  geom_col(show.legend = FALSE) +
  theme_minimal() +
  coord_flip() +
  xlab("") +
  ylab("log(word_prop)") +
  scale_fill_brewer(palette = "Set1") +
  ggtitle("Sample of words that occur with the same frequency in SW and ST")
# remove the irrelevant words, arrange by strongest correlations, might take some time
```
```{r}
# Horizontal bar chart of the words whose log frequency ratio is larger than
# 2.4 in absolute value, filled by the dominates_in factor.
ggplot(
  filter(final_tokens, abs(log_word_prop) > 2.4),
  aes(
    x = reorder(word, log_word_prop),
    y = log_word_prop,
    fill = dominates_in
  )
) +
  geom_col() +
  coord_flip() +
  scale_fill_brewer(palette = "Set1") +
  labs(
    x = "",
    y = "log(word_prop)",
    title = "Words that show strikingly different frequencies in Star Wars and Star Trek"
  ) +
  theme_minimal()
```
Author
imjakedaniels
commented
Jan 27, 2018
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment