Created
February 16, 2018 16:12
-
-
Save earino/9d15c361d0c3b5523989463c02d80a3e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# From the blog post on the Weinstein Effect
# https://www.gokhanciflikli.com/post/weinstein-effect/

library(GuardianR)
library(stringr)
library(tidyverse)
library(tidytext)
library(lubridate)
library(rvest)
library(ggplot2)
library(ggrepel)
library(scales)

# IMPORTANT: pin the session time zone. Presumably this keeps the date
# arithmetic and date comparisons in this script reproducible across machines.
Sys.setenv(TZ = "Europe/Budapest")

# Query window for the Guardian API: the five years ending today.
# - today() takes the calendar date in the session time zone, whereas
#   as.Date() on a date-time converts through UTC by default and can return
#   yesterday's date shortly after local midnight.
# - %m-% rolls back to the last valid day of the month, so running this on
#   29 February does not produce NA the way `- years(5)` does.
todays_date <- as.character(today())
start_date <- as.character(today() %m-% years(5))

# API key handed to get_guardian(); an empty string when the variable is unset.
guardian_access_key <- Sys.getenv("GUARDIAN_ACCESS_KEY")
# Download the articles only when no cached copy exists. Later runs reuse
# articles.csv, so the date window above only matters for the first fetch.
if (!file.exists("articles.csv")) {
  # Sys.getenv() returns "" for an unset variable; fail here with a clear
  # message instead of sending an empty key to the API.
  if (!nzchar(guardian_access_key)) {
    stop(
      "GUARDIAN_ACCESS_KEY is not set; cannot download articles.",
      call. = FALSE
    )
  }

  articles <- get_guardian(
    keywords = "sexual+harassment",
    section = "world",
    from.date = start_date,
    to.date = todays_date,
    api.key = guardian_access_key
  )
  write_csv(articles, "articles.csv")
}
# Word stems that get a trailing "t" re-attached below ("hasn" -> "hasnt").
# Presumably these are contractions whose apostrophe was split off during
# tokenisation -- TODO confirm against the raw article text.
fix_apos <- c("hasn", "hadn", "doesn", "didn", "isn", "wasn", "couldn", "wouldn")

# For each period (before / after 2017-10-05), build one row per word that
# directly follows "he" or "she", with add-one smoothed relative frequencies
# and the log2 ratio of the "she" share to the "he" share.
articles <- read_csv("articles.csv") %>%
  select(webPublicationDate, body) %>%
  # Re-encode to ASCII; bytes that cannot be converted become "<xx>" escapes.
  mutate(body = iconv(body, "", "ASCII", "byte")) %>%
  # Drop every non-greedy "<...>" span: HTML tags and the "<xx>" escapes
  # produced by the previous step.
  mutate(body = gsub("<.*?>", "", body)) %>%
  # TRUE for articles published before the cut-off date.
  mutate(before = webPublicationDate < "2017-10-05") %>%
  unnest_tokens(bigram, body, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), remove = FALSE, sep = " ") %>%
  filter(word1 %in% c("he", "she")) %>%
  mutate(word2 = ifelse(word2 %in% fix_apos, str_c(word2, "t"), word2)) %>%
  # Grouping by period makes the sum() calls below normalise within each
  # period rather than across the whole corpus.
  group_by(before) %>%
  count(word1, word2) %>%
  spread(word1, n, fill = 0) %>%
  mutate(
    total = he + she,
    # Add-one smoothing keeps the ratio finite when a word never follows
    # one of the two pronouns.
    he = (he + 1) / sum(he + 1),
    she = (she + 1) / sum(she + 1),
    log.ratio = log2(she / he),
    abs.ratio = abs(log.ratio)
  ) %>%
  # Grouping was only needed for the normalisation above.
  ungroup() %>%
  arrange(desc(log.ratio))
# Bar chart for the period before the cut-off: the 15 words most skewed
# towards "she" and the 15 most skewed towards "he", among words seen at
# least 5 times after either pronoun.
articles %>%
  filter(before) %>%
  filter(
    !word2 %in% c("himself", "herself", "ever", "quickly",
                  "actually", "sexually", "allegedly", "have"),
    total >= 5
  ) %>%
  group_by(direction = ifelse(log.ratio > 0, 'More "she"', "More 'he'")) %>%
  top_n(15, abs.ratio) %>%
  ungroup() %>%
  # Order the bars by skew instead of alphabetically.
  mutate(word2 = reorder(word2, log.ratio)) %>%
  ggplot(aes(word2, log.ratio, fill = direction)) +
  geom_col() +
  coord_flip() +
  labs(
    x = "",
    y = 'Relative appearance after "she" compared to "he"',
    fill = "",
    title = "Pre Weinstein: 2012-17 The Guardian Articles on Sexual Harassment",
    subtitle = "Top 15 Most Gendered (Skewed) Verbs after he/she; at least 5 occurrences."
  ) +
  # log.ratio is a base-2 logarithm, so break k marks a 2^k-fold difference:
  # 1 -> 2X, 2 -> 4X, 3 -> 8X, 4 -> 16X.
  scale_y_continuous(
    breaks = seq(-4, 4),
    labels = c("16X", "8X", "4X", "2X", "Same", "2X", "4X", "8X", "16X")
  ) +
  guides(fill = guide_legend(reverse = TRUE)) +
  expand_limits(y = c(-4, 4))
# Scatter plot for the period before the cut-off: total uses of a word after
# "he"/"she" (log scale) against its log2 she/he ratio, for the 100 most
# skewed words seen at least 10 times.
articles %>%
  filter(before) %>%
  filter(
    !word2 %in% c("himself", "herself", "she", "too", "later", "apos", "just", "says"),
    total >= 10
  ) %>%
  top_n(100, abs.ratio) %>%
  ggplot(aes(total, log.ratio)) +
  geom_point() +
  # Drawn with the colour string "NA", which R should render as transparent;
  # the line appears to exist only to stretch the x-axis down to 5.
  geom_vline(xintercept = 5, color = "NA") +
  geom_hline(yintercept = 0, color = "red") +
  scale_x_log10(breaks = c(10, 100, 1000)) +
  geom_text_repel(aes(label = word2), segment.alpha = .1, force = 2) +
  # log.ratio is a base-2 logarithm, so break k marks a 2^k-fold difference:
  # 1 -> 2X, 2 -> 4X, 3 -> 8X, 4 -> 16X.
  scale_y_continuous(
    breaks = seq(-4, 4),
    labels = c('16X "he"', '8X "he"', '4X "he"', '2X "he"', "Same",
               '2X "she"', '4X "she"', '8X "she"', '16X "she"')
  ) +
  labs(
    x = 'Total uses after "he" or "she" (Logarithmic scale)',
    y = 'Relative uses after "she" to after "he"',
    title = "Gendered Reporting: Pre Weinstein, The Guardian",
    # NOTE(review): the counts in this subtitle are hard-coded and will go
    # stale if articles.csv is regenerated.
    subtitle = "Words occurring at least 10 times after he/she:\n160 unique words (100 displayed) | 11,013 occurrences in total"
  ) +
  expand_limits(y = c(4, -4))
# Bar chart for the period from the cut-off (2017-10-05) onwards: the 15
# words most skewed towards "she" and the 15 most skewed towards "he", among
# words seen at least 5 times after either pronoun.
articles %>%
  filter(!before) %>%
  filter(
    !word2 %in% c("himself", "herself", "ever", "quickly",
                  "actually", "sexually", "allegedly", "have"),
    total >= 5
  ) %>%
  group_by(direction = ifelse(log.ratio > 0, 'More "she"', "More 'he'")) %>%
  top_n(15, abs.ratio) %>%
  ungroup() %>%
  # Order the bars by skew instead of alphabetically.
  mutate(word2 = reorder(word2, log.ratio)) %>%
  ggplot(aes(word2, log.ratio, fill = direction)) +
  geom_col() +
  coord_flip() +
  labs(
    x = "",
    y = 'Relative appearance after "she" compared to "he"',
    fill = "",
    # This plot shows the rows where `before` is FALSE, so it must not carry
    # the "Pre Weinstein: 2012-17" title of the pre-period chart.
    title = "Post Weinstein: The Guardian Articles on Sexual Harassment since October 2017",
    subtitle = "Top 15 Most Gendered (Skewed) Verbs after he/she; at least 5 occurrences."
  ) +
  # log.ratio is a base-2 logarithm, so break k marks a 2^k-fold difference:
  # 1 -> 2X, 2 -> 4X, 3 -> 8X, 4 -> 16X.
  scale_y_continuous(
    breaks = seq(-4, 4),
    labels = c("16X", "8X", "4X", "2X", "Same", "2X", "4X", "8X", "16X")
  ) +
  guides(fill = guide_legend(reverse = TRUE)) +
  expand_limits(y = c(-4, 4))
# Scatter plot for the period from the cut-off (2017-10-05) onwards: total
# uses of a word after "he"/"she" (log scale) against its log2 she/he ratio,
# for the 100 most skewed words seen at least `post_min_total` times.
post_min_total <- 2

# All words that pass the filters; the subtitle counts come from this table.
post_words <- articles %>%
  filter(!before) %>%
  filter(
    !word2 %in% c("himself", "herself", "she", "too", "later", "apos", "just", "says"),
    total >= post_min_total
  )

# The subset that is actually drawn (top_n keeps ties, so this can exceed 100).
post_shown <- post_words %>%
  top_n(100, abs.ratio)

# Build the subtitle from the data instead of reusing the pre-period figures,
# so the stated threshold and counts always match what is plotted.
post_subtitle <- paste0(
  "Words occurring at least ", post_min_total, " times after he/she:\n",
  format(n_distinct(post_words$word2), big.mark = ","), " unique words (",
  format(nrow(post_shown), big.mark = ","), " displayed) | ",
  format(sum(post_words$total), big.mark = ","), " occurrences in total"
)

post_shown %>%
  ggplot(aes(total, log.ratio)) +
  geom_point() +
  # Drawn with the colour string "NA", which R should render as transparent;
  # the line appears to exist only to stretch the x-axis to include 5.
  geom_vline(xintercept = 5, color = "NA") +
  geom_hline(yintercept = 0, color = "red") +
  scale_x_log10(breaks = c(10, 100, 1000)) +
  geom_text_repel(aes(label = word2), segment.alpha = .1, force = 2) +
  # log.ratio is a base-2 logarithm, so break k marks a 2^k-fold difference:
  # 1 -> 2X, 2 -> 4X, 3 -> 8X, 4 -> 16X.
  scale_y_continuous(
    breaks = seq(-4, 4),
    labels = c('16X "he"', '8X "he"', '4X "he"', '2X "he"', "Same",
               '2X "she"', '4X "she"', '8X "she"', '16X "she"')
  ) +
  labs(
    x = 'Total uses after "he" or "she" (Logarithmic scale)',
    y = 'Relative uses after "she" to after "he"',
    # This plot shows the rows where `before` is FALSE, hence "Post".
    title = "Gendered Reporting: Post Weinstein, The Guardian",
    subtitle = post_subtitle
  ) +
  expand_limits(y = c(4, -4))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment