Skip to content

Instantly share code, notes, and snippets.

@farach
Last active October 17, 2021 21:12
Show Gist options
  • Save farach/c241d6d5aafec5735b848e1abac5b99c to your computer and use it in GitHub Desktop.
Save farach/c241d6d5aafec5735b848e1abac5b99c to your computer and use it in GitHub Desktop.
Web scraping Glassdoor reviews, using spaCy for NLP, and plotting the evolution of reviews
---
title: "Using spacyr on scraped Glassdoor reviews"
author: "Alex Farach"
date: "10/17/2021"
output: html_document
---
```{r setup, include=FALSE}
library(knitr)
# Chunk defaults for the whole document: cache results, keep code visible,
# silence warnings/messages, and render 8 x 5 figures at 180 dpi.
knitr::opts_chunk$set(
  cache = TRUE,
  cache.lazy = FALSE,
  warning = FALSE,
  message = FALSE,
  echo = TRUE,
  dpi = 180,
  fig.width = 8,
  fig.height = 5
)
library(tidyverse)
library(patchwork)
library(lubridate)
library(tidytext)
#reticulate::py_install('spaCy')
library(spacyr)
library(httr)
library(xml2)
library(rvest)
library(viridis)
library(ggExtra)
#' House ggplot2 theme for this report.
#'
#' Starts from `theme_minimal()` and layers the report's grid, text, legend
#' and margin styling on top of it.
#'
#' @param font_family Base font family passed to `theme_minimal()`. The
#'   default `""` keeps ggplot2's default font.
#' @return A complete ggplot2 theme. Adding a complete theme to a plot replaces
#'   `theme()` settings made earlier in that plot's chain, so plot-specific
#'   tweaks belong after `theme_afs()`.
theme_afs <- function(font_family = "") {
  # The `+` matters: a bare theme_minimal() statement would be evaluated and
  # thrown away, leaving only the partial theme() below as the return value.
  theme_minimal(base_family = font_family) +
    theme(
      panel.grid.minor = element_blank(),
      panel.grid.major.y = element_line(
        color = "#cbcbcb",
        linetype = "dashed"
      ),
      panel.grid.major.x = element_line(
        color = "#cbcbcb",
        linetype = "dashed"
      ),
      panel.background = element_blank(),
      strip.background = element_rect(color = "white", size = 3),
      strip.text = element_text(
        color = "#4B636E",
        size = 9
      ),
      axis.ticks = element_blank(),
      plot.title = element_text(
        size = 20,
        face = "bold",
        hjust = 0.5,
        vjust = 2,
        color = "#6a1c91"
      ),
      plot.subtitle = element_text(
        size = 14,
        color = "#6a1c91",
        hjust = 0.5
      ),
      plot.caption = element_text(
        size = 9,
        hjust = 1,
        color = "#6a1c91"
      ),
      axis.title = element_text(
        size = 10,
        color = "#4B636E"
      ),
      axis.text = element_text(
        size = 9,
        color = "#4B636E"
      ),
      axis.text.x = element_text(
        margin = margin(5, b = 10)
      ),
      # Default the legend to the bottom. Because this theme is complete, it
      # replaces theme() settings added to a plot before it; plots that ask
      # for a bottom legend ahead of theme_afs() therefore keep that layout,
      # and plots wanting another position set it after theme_afs().
      legend.position = "bottom",
      legend.text.align = 0,
      legend.background = element_blank(),
      legend.title = element_text(
        size = 8,
        color = "#4B636E"
      ),
      legend.key = element_blank(),
      legend.text = element_text(
        size = 8,
        color = "#4B636E"
      ),
      plot.margin = unit(c(1, 1, 1.5, 1), "cm"),
      axis.title.y = element_text(
        margin = margin(t = 0, r = 10, b = 0, l = 0),
        angle = 90
      )
    )
}

# Make the house theme the default for every plot in the document.
theme_set(theme_afs())
```
# Scrape
```{r}
# Review pages are addressed as <baseurl><companyNum>_P<page><sort>.
baseurl <- "https://www.glassdoor.com/Reviews/Company-Reviews-"
companyNum <- "E4138"
sort <- ".htm?sort.sortType=RD&sort.ascending=false"
maxResults <- 500

# CSS selector for each scraped field, keyed by the column it fills in `gd`.
# The rating and summary columns are called rev.sum / rev.sum.1 because the
# cleaning chunk reads them as rev_sum / rev_sum_1 after clean_names().
review_selectors <- c(
  rev.sum = "#ReviewsFeed .mr-xsm",
  rev.sum.1 = ".reviewLink",
  rev.title = "span.authorInfo",
  rev.pros = ".v2__EIReviewDetailsV2__fullWidth:nth-child(1) span",
  rev.cons =
    ".v2__EIReviewDetailsV2__fullWidth+ .v2__EIReviewDetailsV2__fullWidth span",
  rev.helpf = ".common__EiReviewDetailsStyle__socialHelpfulcontainer"
)

# Download review page `i` once and return one row per review with every
# field taken from that same response. Fetching the page separately for each
# field costs six requests per page and can misalign the columns if the feed
# changes between requests.
scrape_review_page <- function(i) {
  cat(" P", i, sep = "")
  pg <- read_html(GET(paste0(baseurl, companyNum, "_P", i, sort)))
  fields <- lapply(
    review_selectors,
    function(css) html_text(html_nodes(pg, css))
  )

  # Columns are matched up by position, so every selector has to hit the same
  # number of nodes; otherwise fields would be attached to the wrong review.
  n_found <- lengths(fields)
  if (length(unique(n_found)) > 1) {
    stop(
      "Page ", i, ": selectors matched different numbers of nodes (",
      paste(names(n_found), n_found, sep = " = ", collapse = ", "),
      "), so the fields cannot be aligned per review.",
      call. = FALSE
    )
  }

  data.frame(fields, stringsAsFactors = FALSE)
}

# One data frame with all pages stacked; pages without reviews add no rows.
gd <- map_df(seq_len(maxResults), scrape_review_page)

# Export to csv
write.csv(gd, "./gd_results.csv", row.names = FALSE)
```
```{r}
# Tidy the scraped table: split the author line into date / job title / city /
# state, turn the "helpful" text into a number, and normalise all text
# columns (trimmed, lower case, punctuation removed).
# NOTE(review): the string positions below assume rev_title looks like
# "Oct 17, 2021 - Job title in City, ST" -- confirm against the scraped data.
# Splitting on "in " also splits inside words that end in "in".
gd_clean <- gd %>%
  janitor::clean_names() %>%
  mutate(
    rev_date = mdy(str_sub(rev_title, 1, 12)),
    # Location is whatever follows the first "in "; its last two characters
    # are taken as the state and the trailing ", ST" is cut from the city.
    rev_city = str_split(rev_title, "in ", simplify = TRUE)[, 2],
    rev_state = str_sub(rev_city, -2),
    rev_city = str_sub(rev_city, 1, -5),
    rev_title = str_split(str_sub(rev_title, 15), "in ", simplify = TRUE)[, 1],
    # Take the whole first run of digits ("12 people ..." -> 12). Extracting
    # single digits would keep only the leading one, and indexing the matrix
    # from str_extract_all() fails when no row contains a digit.
    rev_helpf = as.numeric(str_extract(rev_helpf, "\\d+")),
    # After clean_names() the rating is rev_sum and the summary is rev_sum_1.
    rev_rating = rev_sum,
    rev_sum = rev_sum_1
  ) %>%
  unite("rev_full", rev_sum:rev_cons, remove = FALSE, sep = " ") %>%
  # Reviews without a helpful count get 0.
  mutate(across(where(is.numeric), ~ replace_na(.x, 0))) %>%
  mutate(across(
    where(is.character),
    function(x) str_remove_all(tolower(str_trim(x, "both")), "[:punct:]")
  )) %>%
  select(rev_date, rev_state, rev_city, rev_title, rev_rating, everything())
```
# spaCy
```{r}
# Install spacy
#spacy_install()

# Start the spaCy backend with the small English model.
spacy_initialize(model = "en_core_web_sm")

# Raw pro / con texts, one element per review.
gd_spacy_pro <- gd_clean[["rev_pros"]]
gd_spacy_con <- gd_clean[["rev_cons"]]

# Token-level parses, each tagged with the review side it came from.
gd_parsed_pro <- mutate(spacy_parse(gd_spacy_pro), pro_con = "pro")
gd_parsed_con <- mutate(spacy_parse(gd_spacy_con), pro_con = "con")
```
# Plot
```{r}
# spacy_parse() labels documents "text1", "text2", ... in input order, so the
# same id is rebuilt from the row number to attach tokens to their review.
gd_with_doc_id <- gd_clean %>%
  mutate(doc_id = paste0("text", row_number()))

gd_clean_parsed_pro <- left_join(gd_with_doc_id, gd_parsed_pro, by = "doc_id")
gd_clean_parsed_con <- left_join(gd_with_doc_id, gd_parsed_con, by = "doc_id")

# Most frequent nouns per rating category for one side (pros or cons).
# `parsed_reviews` is a review table joined to its spaCy tokens; `n_top` is
# how many nouns to keep per rating (ties can return more).
top_nouns_by_rating <- function(parsed_reviews, n_top = 10) {
  parsed_reviews %>%
    mutate(rev_rating = as.factor(rev_rating)) %>%
    filter(pos == "NOUN") %>%
    count(pro_con, rev_rating, token, sort = TRUE) %>%
    group_by(rev_rating) %>%
    slice_max(order_by = n, n = n_top) %>%
    ungroup()
}

gd_plot1_pro <- top_nouns_by_rating(gd_clean_parsed_pro)
gd_plot1_con <- top_nouns_by_rating(gd_clean_parsed_con)

# Number of reviews per rating. The plot labels sum_n as "Review count", so it
# has to count reviews rather than add up the noun frequencies.
reviews_per_rating <- table(gd_clean$rev_rating)

gd_plot1 <- bind_rows(gd_plot1_pro, gd_plot1_con) %>%
  mutate(
    sum_n = as.integer(reviews_per_rating[as.character(rev_rating)]),
    # Cons are plotted to the left of zero, pros to the right.
    n = ifelse(pro_con == "con", -n, n)
  )
gd_plot1
```
```{r}
# Diverging bars of the most common nouns, one facet per rating category.
# Counts for cons are negative in gd_plot1, so they extend left of zero.
(pp1 <- gd_plot1 %>%
  mutate(
    # Order the bars within each rating / side combination.
    token = tidytext::reorder_within(
      x = token, by = n, within = list(rev_rating, pro_con)
    ),
    pro_con = factor(
      ifelse(pro_con == "pro", "Pros", "Cons"),
      levels = c("Pros", "Cons")
    ),
    # Facet label: rating plus the sum_n value carried by gd_plot1.
    rev_rating = paste0(
      "Rating category: ",
      rev_rating,
      "\nReview count: ",
      scales::comma(sum_n)
    )
  ) %>%
  ggplot(aes(token, n, fill = pro_con)) +
  geom_col(alpha = 0.75) +
  facet_wrap(~ rev_rating, scales = "free") +
  coord_flip() +
  # Strips the suffix that reorder_within() appends to the token labels.
  tidytext::scale_x_reordered() +
  scale_fill_viridis(discrete = TRUE) +
  labs(
    title = "Top 10 most common nouns by rating category",
    x = NULL,
    y = NULL,
    fill = "Review category - "
  ) +
  theme_afs() +
  theme(
    plot.title = element_text(
      size = 10,
      face = "bold",
      hjust = 0,
      vjust = 2,
      color = "#6a1c91"
    ),
    plot.title.position = "plot",
    legend.position = c(0.9, 0.3),
    legend.justification = c(1, 1)
  )
)
```
```{r}
# Share of each review year within every rating category (filled bars).
(
  pp2 <- gd_clean %>%
    mutate(
      # Years outside 2016-2021 become NA.
      year = factor(
        lubridate::year(rev_date),
        levels = c("2016", "2017", "2018", "2019", "2020", "2021")
      )
    ) %>%
    count(rev_rating, year) %>%
    group_by(year) %>%
    mutate(year_total = sum(n)) %>%
    # Relabel only after ungrouping, so the column being rewritten is no
    # longer a grouping variable.
    ungroup() %>%
    mutate(
      # Legend label: the year with its total number of reviews underneath.
      year = paste0(year, "\n", scales::comma(year_total))
    ) %>%
    ggplot(aes(rev_rating, n, fill = year)) +
    geom_col(
      position = position_fill(reverse = TRUE)
    ) +
    scale_y_continuous(labels = scales::percent) +
    scale_fill_viridis(discrete = TRUE) +
    labs(
      title = "Rating frequency by year",
      y = "Frequency",
      x = "Rating category",
      fill = "Year - \nTotal Reviews - "
    ) +
    coord_flip() +
    guides(fill = guide_legend(nrow = 1)) +
    theme_afs() +
    # Plot-specific tweaks come after theme_afs() so the house theme cannot
    # override them.
    theme(
      plot.title = element_text(
        size = 10,
        face = "bold",
        hjust = 0,
        vjust = 2,
        color = "#6a1c91"
      ),
      plot.title.position = "plot",
      legend.position = "bottom",
      legend.title = element_text(
        size = 9
      ),
      legend.text = element_text(
        size = 9.5
      )
    )
)
```
```{r}
# Stacked area chart: monthly share of each rating category.
(
  pp3 <- gd_clean %>%
    mutate(year_month = floor_date(rev_date, "month")) %>%
    count(year_month, rev_rating) %>%
    # Give every month a row for every rating (0 when absent) so the stacked
    # areas are continuous. This works for whatever rating values occur and
    # does not depend on column order.
    complete(year_month, rev_rating, fill = list(n = 0L)) %>%
    group_by(year_month) %>%
    mutate(percentage = n / sum(n)) %>%
    ungroup() %>%
    ggplot(aes(year_month, percentage, fill = as.factor(rev_rating))) +
    geom_area(alpha = 0.75, size = .5, colour = "white") +
    scale_fill_viridis(discrete = TRUE) +
    scale_y_continuous(labels = scales::percent) +
    guides(fill = guide_legend(nrow = 1)) +
    theme_afs() +
    theme(
      plot.title = element_text(
        size = 10,
        face = "bold",
        hjust = 0,
        vjust = 2,
        color = "#6a1c91",
        margin = margin(0, 0, 30, 0)
      ),
      plot.title.position = "plot",
      legend.position = "bottom",
      legend.title = element_text(
        size = 9
      ),
      legend.text = element_text(
        size = 9.5
      )
    ) +
    labs(
      title = "Rating category percentage over time",
      y = "Percentage",
      x = "Date",
      fill = "Rating category -"
    )
)
```
```{r}
# Styling for the title and caption shared by the combined figure.
annotation_theme <- theme(
  plot.title = element_text(
    size = 20, face = "bold", hjust = 0.5, vjust = 2, color = "#6a1c91"
  ),
  plot.caption = element_text(size = 9, hjust = 1, color = "#6a1c91")
)

# Noun chart on the left; the two rating charts stacked on the right.
pp1 + (pp2 / pp3) +
  plot_annotation(
    title = "Glassdoor reviews: Accenture",
    caption = "Source:\nGlassdoor.com",
    theme = annotation_theme
  )
```
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment