Last active
October 17, 2021 21:12
-
-
Save farach/c241d6d5aafec5735b848e1abac5b99c to your computer and use it in GitHub Desktop.
Web scraping Glassdoor reviews, using spaCy for NLP, and plotting the evolution of reviews
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: "Using SpacyR on scraped glassdoor reviews" | |
author: "Alex Farach" | |
date: "10/17/2021" | |
output: html_document | |
--- | |
```{r setup, include=FALSE} | |
library(knitr) | |
# Default chunk options for the whole document: cache chunk results (without
# lazy loading), hide warnings and messages, echo the code, and size figures
# at 8 x 5 with 180 dpi.
knitr::opts_chunk$set(
  cache = TRUE,
  cache.lazy = FALSE,
  warning = FALSE,
  message = FALSE,
  echo = TRUE,
  dpi = 180,
  fig.width = 8,
  fig.height = 5
)
library(tidyverse) | |
library(patchwork) | |
library(lubridate) | |
library(tidytext) | |
#reticulate::py_install('spaCy') | |
library(spacyr) | |
library(httr) | |
library(xml2) | |
library(rvest) | |
library(viridis) | |
library(ggExtra) | |
#' Custom ggplot2 theme used for every plot in this document.
#'
#' Builds on `theme_minimal()` and adds dashed grey major grid lines, purple
#' titles/captions, slate-coloured axis and legend text, and wider plot
#' margins.
#'
#' @param font_family Base font family handed to `theme_minimal()`. The
#'   default `""` keeps ggplot2's default font, so `theme_afs()` can be called
#'   without arguments.
#' @return A complete ggplot2 theme. Because the theme is complete, adding it
#'   to a plot replaces theme settings added earlier in the chain; put
#'   per-plot `theme()` tweaks after `theme_afs()`.
theme_afs <- function(font_family = "") {
  title_colour <- "#6a1c91"
  text_colour <- "#4B636E"
  dashed_grid <- element_line(color = "#cbcbcb", linetype = "dashed")

  # The `+` is essential: without it theme_minimal() is evaluated and thrown
  # away, and only the partial theme() below would be returned.
  theme_minimal(base_family = font_family) +
    theme(
      panel.grid.minor = element_blank(),
      panel.grid.major.y = dashed_grid,
      panel.grid.major.x = dashed_grid,
      panel.background = element_blank(),
      strip.background = element_rect(color = "white", size = 3),
      strip.text = element_text(color = text_colour, size = 9),
      axis.ticks = element_blank(),
      plot.title = element_text(
        size = 20,
        face = "bold",
        hjust = 0.5,
        vjust = 2,
        color = title_colour
      ),
      plot.subtitle = element_text(
        size = 14,
        color = title_colour,
        hjust = 0.5
      ),
      plot.caption = element_text(
        size = 9,
        hjust = 1,
        color = title_colour
      ),
      axis.title = element_text(size = 10, color = text_colour),
      axis.text = element_text(size = 9, color = text_colour),
      axis.text.x = element_text(margin = margin(5, b = 10)),
      legend.text.align = 0,
      legend.background = element_blank(),
      legend.title = element_text(size = 8, color = text_colour),
      legend.key = element_blank(),
      legend.text = element_text(size = 8, color = text_colour),
      plot.margin = unit(c(1, 1, 1.5, 1), "cm"),
      axis.title.y = element_text(
        margin = margin(t = 0, r = 10, b = 0, l = 0),
        angle = 90
      )
    )
}

# Make the custom theme the session default for all subsequent plots.
theme_set(theme_afs())
``` | |
# Scrape | |
```{r} | |
# Pieces of the Glassdoor review-listing URL. Page i is requested as
# <baseurl><companyNum>_P<i><sort>.
baseurl <- "https://www.glassdoor.com/Reviews/Company-Reviews-"
companyNum <- "E4138"
sort <- ".htm?sort.sortType=RD&sort.ascending=false"
maxResults <- 500

# CSS selector for each review field that is extracted from a listing page.
review_selectors <- c(
  rating = "#ReviewsFeed .mr-xsm",
  summary = ".reviewLink",
  title = "span.authorInfo",
  pros = ".v2__EIReviewDetailsV2__fullWidth:nth-child(1) span",
  cons = ".v2__EIReviewDetailsV2__fullWidth+ .v2__EIReviewDetailsV2__fullWidth span",
  helpful = ".common__EiReviewDetailsStyle__socialHelpfulcontainer"
)

# Download listing page `i` once and return a named list holding, for every
# selector, the text of all matching nodes. Fetching the page a single time
# (instead of once per field) cuts the number of HTTP requests six-fold and
# guarantees all fields come from the same snapshot of the page.
scrape_review_page <- function(i) {
  cat(" P", i, sep = "")
  pg <- read_html(GET(paste0(baseurl, companyNum, "_P", i, sort)))
  map(review_selectors, function(css) html_text(html_nodes(pg, css)))
}

scraped_pages <- map(seq_len(maxResults), scrape_review_page)

# Stack one field across all pages into a single-column data frame.
collect_field <- function(field, column) {
  values <- as.character(
    unlist(map(scraped_pages, field), use.names = FALSE)
  )
  setNames(data.frame(values, stringsAsFactors = FALSE), column)
}

# rating
rating <- collect_field("rating", "rev.sum")
# 2. Summary (same column name as rating; data.frame() below renames the
# duplicate to rev.sum.1, which the cleaning step relies on)
summary <- collect_field("summary", "rev.sum")
# 4. Title
title <- collect_field("title", "rev.title")
# 5. Pros
pros <- collect_field("pros", "rev.pros")
# 6. Cons
cons <- collect_field("cons", "rev.cons")
# 7. Helpful
helpful <- collect_field("helpful", "rev.helpf")

# The fields are combined purely by position, so they must all have the same
# length. data.frame() can silently recycle a shorter field when the lengths
# happen to divide, which would misalign reviews; fail loudly instead.
field_rows <- c(
  rating = nrow(rating),
  summary = nrow(summary),
  title = nrow(title),
  pros = nrow(pros),
  cons = nrow(cons),
  helpful = nrow(helpful)
)
if (length(unique(field_rows)) > 1) {
  stop(
    "Scraped fields have different lengths and cannot be aligned: ",
    paste0(names(field_rows), " = ", field_rows, collapse = ", "),
    call. = FALSE
  )
}

gd <- data.frame(
  rating,
  summary,
  title,
  pros,
  cons,
  helpful
)

# Export to csv
write.csv(gd, "./gd_results.csv", row.names = FALSE)
``` | |
```{r} | |
# Tidy the scraped reviews. The raw `rev_title` string is parsed as
# "<date> - <job title> in <city>, <ST>": the first 12 characters hold the
# date and everything from character 15 onwards is the title/location part.
gd_clean <- gd %>%
  janitor::clean_names() %>%
  mutate(
    rev_date = mdy(str_sub(rev_title, 1, 12)),
    # Temporary helpers, dropped again below.
    rev_role_loc = str_sub(rev_title, 15),
    # Location = text after the LAST whitespace-delimited "in". Splitting on
    # the bare substring "in " would also cut inside words such as "Admin "
    # and would error when no title contains it at all.
    rev_location = if_else(
      str_detect(rev_role_loc, "\\sin\\s"),
      str_trim(
        str_replace(rev_role_loc, regex("^.*\\sin\\s", dotall = TRUE), "")
      ),
      ""
    ),
    rev_state = str_sub(rev_location, -2),
    # Strip the trailing ", ST" (4 characters) to keep only the city.
    rev_city = str_sub(rev_location, 1, -5),
    # Job title = text before the last " in "; unchanged when no location.
    rev_title = str_replace(
      rev_role_loc,
      regex("^(.*)\\sin\\s.*$", dotall = TRUE),
      "\\1"
    ),
    # Take the whole first number, not just its first digit; text without
    # any digits becomes NA and is turned into 0 below.
    rev_helpf = as.numeric(str_extract(rev_helpf, "[:digit:]+")),
    # After clean_names() the rating text is in `rev_sum` and the review
    # summary in `rev_sum_1`; move them to descriptive columns.
    rev_rating = rev_sum,
    rev_sum = rev_sum_1
  ) %>%
  select(-rev_role_loc, -rev_location) %>%
  # `rev_sum_1` is left out so the summary is not pasted into rev_full twice.
  unite(
    "rev_full",
    c(rev_sum, rev_title, rev_pros, rev_cons),
    remove = FALSE,
    sep = " "
  ) %>%
  mutate(across(where(is.numeric), ~ replace_na(.x, 0))) %>%
  # Normalise text: trim, lower-case, then drop punctuation.
  mutate(
    across(
      where(is.character),
      ~ str_remove_all(tolower(str_trim(.x, "both")), "[:punct:]")
    )
  ) %>%
  select(rev_date, rev_state, rev_city, rev_title, rev_rating, everything())
``` | |
# spaCy | |
```{r} | |
# Character vectors of the cleaned "pros" and "cons" texts, one element per
# review.
gd_spacy_pro <- gd_clean[["rev_pros"]]
gd_spacy_con <- gd_clean[["rev_cons"]]

# Install spacy
# spacy_install()

# Start the spaCy backend with the small English model.
spacy_initialize(model = "en_core_web_sm")

# Parse both text sets and tag every parsed token with its source.
gd_parsed_pro <- mutate(spacy_parse(gd_spacy_pro), pro_con = "pro")
gd_parsed_con <- mutate(spacy_parse(gd_spacy_con), pro_con = "con")
``` | |
# Plot | |
```{r} | |
# Attach parsed tokens to their reviews. The join key is built as
# "text<row number>" so that it lines up with the `doc_id` column of the
# parsed token tables.
attach_parsed_tokens <- function(reviews, parsed) {
  reviews %>%
    mutate(doc_id = paste0("text", row_number())) %>%
    left_join(parsed, by = "doc_id")
}

# Count nouns per rating and keep the `top_n` most frequent ones in each
# rating category (ties are kept by slice_max()).
top_nouns_by_rating <- function(tokens, top_n = 10) {
  tokens %>%
    mutate(rev_rating = as.factor(rev_rating)) %>%
    filter(pos == "NOUN") %>%
    count(pro_con, rev_rating, token, sort = TRUE) %>%
    group_by(rev_rating) %>%
    slice_max(order_by = n, n = top_n) %>%
    ungroup()
}

gd_clean_parsed_pro <- attach_parsed_tokens(gd_clean, gd_parsed_pro)
gd_clean_parsed_con <- attach_parsed_tokens(gd_clean, gd_parsed_con)

gd_plot1_pro <- top_nouns_by_rating(gd_clean_parsed_pro)
gd_plot1_con <- top_nouns_by_rating(gd_clean_parsed_con)

# Number of reviews per rating category. `sum_n` is displayed as the
# "Review count" in the facet labels, so it has to count reviews rather than
# add up the token frequencies of the top nouns.
review_counts <- gd_clean %>%
  count(rev_rating = as.factor(rev_rating), name = "sum_n")

gd_plot1 <- bind_rows(gd_plot1_pro, gd_plot1_con) %>%
  left_join(review_counts, by = "rev_rating") %>%
  # Mirror the "con" counts to the negative side of the axis.
  mutate(n = if_else(pro_con == "con", -n, n))

gd_plot1
``` | |
```{r} | |
# Plot 1: most common nouns per rating category, pros on the positive side
# and cons on the negative side of the axis.
pp1_data <- gd_plot1 %>%
  mutate(
    # Order the bars within each rating / pro-con combination; this has to
    # run before `rev_rating` and `pro_con` are relabelled below.
    token = tidytext::reorder_within(
      x = token,
      by = n,
      within = list(rev_rating, pro_con)
    ),
    pro_con = factor(
      ifelse(pro_con == "pro", "Pros", "Cons"),
      levels = c("Pros", "Cons")
    ),
    # Facet label carrying the rating and the count stored in `sum_n`.
    rev_rating = paste0(
      "Rating category: ",
      rev_rating,
      "\nReview count: ",
      scales::comma(sum_n)
    )
  )

pp1 <- ggplot(pp1_data, aes(token, n, fill = pro_con)) +
  geom_col(alpha = 0.75) +
  facet_wrap(~ rev_rating, scales = "free") +
  coord_flip() +
  # Strip the suffix reorder_within() appended to the token labels.
  tidytext::scale_x_reordered() +
  scale_fill_viridis(discrete = TRUE) +
  labs(
    title = "Top 10 most common nouns by rating category",
    x = NULL,
    y = NULL,
    fill = "Review category - "
  ) +
  theme_afs() +
  # Per-plot tweaks come after theme_afs() so they are not overridden.
  theme(
    plot.title = element_text(
      size = 10,
      face = "bold",
      hjust = 0,
      vjust = 2,
      color = "#6a1c91"
    ),
    plot.title.position = "plot",
    legend.position = c(0.9, 0.3),
    legend.justification = c(1, 1)
  )

pp1
``` | |
```{r} | |
# Plot 2: for every rating category, the share of its reviews written in
# each year.
pp2 <- gd_clean %>%
  mutate(
    # Years outside 2016-2021 are not among the levels and become NA.
    year = factor(
      lubridate::year(rev_date),
      levels = c("2016", "2017", "2018", "2019", "2020", "2021")
    )
  ) %>%
  count(rev_rating, year) %>%
  group_by(year) %>%
  mutate(
    year_total = sum(n),
    # Legend label: the year with its total number of reviews underneath.
    year = paste0(year, "\n", scales::comma(unique(year_total)))
  ) %>%
  ungroup() %>%
  ggplot(aes(rev_rating, n, fill = year)) +
  geom_col(position = position_fill(reverse = TRUE)) +
  scale_y_continuous(labels = scales::percent) +
  scale_fill_viridis(discrete = TRUE) +
  labs(
    title = "Rating frequency by year",
    y = "Frequency",
    x = "Rating category",
    fill = "Year - \nTotal Reviews - "
  ) +
  coord_flip() +
  guides(fill = guide_legend(nrow = 1)) +
  theme_afs() +
  # All per-plot theme tweaks go after theme_afs(): settings placed before
  # it are overridden (legend.background) or, if theme_afs() is a complete
  # theme, discarded entirely (legend.position).
  theme(
    plot.title = element_text(
      size = 10,
      face = "bold",
      hjust = 0,
      vjust = 2,
      color = "#6a1c91"
    ),
    plot.title.position = "plot",
    legend.position = "bottom",
    legend.title = element_text(size = 9),
    legend.text = element_text(size = 9.5)
  )

pp2
``` | |
```{r} | |
# Plot 3: monthly share of each rating category as a stacked area chart.
pp3_data <- gd_clean %>%
  mutate(year_month = floor_date(rev_date, "month")) %>%
  count(year_month, rev_rating) %>%
  # Add explicit zero rows for ratings that are missing in a month so the
  # stacked areas stay continuous. Unlike a pivot_wider()/pivot_longer()
  # round trip over columns `1`:`5`, this does not assume which rating
  # values occur in the data.
  complete(year_month, rev_rating, fill = list(n = 0L)) %>%
  group_by(year_month) %>%
  mutate(percentage = n / sum(n)) %>%
  ungroup()

pp3 <- ggplot(
  pp3_data,
  aes(year_month, percentage, fill = as.factor(rev_rating))
) +
  geom_area(alpha = 0.75, size = .5, colour = "white") +
  scale_fill_viridis(discrete = TRUE) +
  scale_y_continuous(labels = scales::percent) +
  guides(fill = guide_legend(nrow = 1)) +
  theme_afs() +
  # Per-plot tweaks come after theme_afs() so they are not overridden.
  theme(
    plot.title = element_text(
      size = 10,
      face = "bold",
      hjust = 0,
      vjust = 2,
      color = "#6a1c91",
      margin = margin(0, 0, 30, 0)
    ),
    plot.title.position = "plot",
    legend.position = "bottom",
    legend.title = element_text(size = 9),
    legend.text = element_text(size = 9.5)
  ) +
  labs(
    title = "Rating category percentage over time",
    y = "Percentage",
    x = "Date",
    fill = "Rating category -"
  )

pp3
``` | |
```{r} | |
# Styling for the overall title and caption of the combined figure.
dashboard_theme <- theme(
  plot.title = element_text(
    size = 20,
    face = "bold",
    hjust = 0.5,
    vjust = 2,
    color = "#6a1c91"
  ),
  plot.caption = element_text(
    size = 9,
    hjust = 1,
    color = "#6a1c91"
  )
)

# Combine the three plots with patchwork: `/` binds tighter than `+`, so pp2
# is stacked on top of pp3 and that pair is placed next to pp1.
pp1 + pp2 / pp3 +
  plot_annotation(
    title = "Glassdoor reviews: Accenture",
    caption = "Source:\nGlassdoor.com",
    theme = dashboard_theme
  )
``` |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment