Last active
September 17, 2021 16:04
-
-
Save padpadpadpad/adb5734d16fcf55103e16a9f2a4ec490 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# look at gender balance of references

# load packages
library(tidyverse)
library(gender)

# load in dataset and do some wrangling
# the csv is read without a header: column 1 (V1) is parsed for the reference
# number, column 2 (V2) holds the author list followed by the year
d <- read.csv('path_to_refs/reference_list.csv', header = FALSE,
              stringsAsFactors = FALSE) %>%
  # rename columns
  rename(reference = V1, authors = V2) %>%
  # grab reference number, drop everything from the first digit (the year) or
  # '!' onwards, and turn the word ' and' into a ',' so that every author is
  # comma separated; \\b stops ' and' matching the start of a longer word
  mutate(reference = parse_number(reference),
         authors = gsub('[0-9!].*', '', authors),
         authors = gsub(' and\\b', ',', authors)) %>%
  # stack all authors from the same reference
  separate_rows(authors, sep = ',') %>%
  # remove white space from names BEFORE dropping empty rows, otherwise a
  # whitespace-only fragment (e.g. left by a trailing comma in front of the
  # year) survives the filter and is counted as the last author
  mutate(authors = trimws(authors)) %>%
  # remove empty rows
  filter(authors != '') %>%
  group_by(reference) %>%
  # add loose naming strategies for first and last author; a reference with a
  # single author is labelled 'first_author' because that condition is
  # checked first
  mutate(order = row_number(),
         author_info = case_when(order == 1 ~ 'first_author',
                                 order == max(order) ~ 'last_author',
                                 TRUE ~ 'co_author')) %>%
  ungroup()
# grab first name (everything before the first space in the author string)
d <- mutate(d, first_name = gsub('\\ .*', '', authors))

# drop samples with only initial as first name (detected by a '.' in it)
d2 <- filter(d, !str_detect(first_name, '\\.'))

# predict gender based on first name
# method = 'genderize' queries a remote service, so each distinct first name
# is looked up only once instead of once per author row
# NOTE(review): the service is presumably rate limited - confirm current quota
name_genders <- distinct(d2, first_name) %>%
  mutate(gender_predict = map(first_name,
                              ~ gender(.x, method = 'genderize'))) %>%
  # a lookup that returns no rows simply drops out here and ends up as NA
  # after the join below
  unnest(gender_predict)

# get predictions: attach them to every author row via our own first_name key
d2 <- left_join(d2, name_genders, by = 'first_name')

# remove NAs where the prediction has not worked
# (relies on the prediction result having a `gender` column)
d3 <- filter(d2, !is.na(gender))
# Each summary below is a tibble with one row per predicted gender and a
# `prop` column holding that gender's share of the rows being counted.

# share of each gender among last authors
last_authors <- d3 %>%
  filter(author_info == 'last_author') %>%
  count(gender) %>%
  transmute(gender, prop = n / sum(n))

# share of each gender among first authors
first_authors <- d3 %>%
  filter(author_info == 'first_author') %>%
  count(gender) %>%
  transmute(gender, prop = n / sum(n))

# share of each gender among all authors
all_authors <- d3 %>%
  count(gender) %>%
  transmute(gender, prop = n / sum(n))

# share of references with at least one author of each gender; a reference is
# counted once per gender, and the denominator is the number of distinct
# references left in d3
papers_with_female_male <- d3 %>%
  distinct(reference, gender) %>%
  count(gender) %>%
  transmute(gender, prop = n / n_distinct(d3$reference))
# Pull the proportion for one gender out of a summary table (columns `gender`
# and `prop`) by label instead of by row position. Row position is only right
# when both genders are present; if one is missing, row 1 would silently be
# the other gender.
# Returns NA when the table is empty (nothing to take a proportion of) and 0
# when the table has rows but none for the requested gender.
gender_prop <- function(props, which_gender) {
  if (nrow(props) == 0) {
    return(NA_real_)
  }
  hit <- props$prop[props$gender %in% which_gender]
  if (length(hit) == 0) 0 else hit[[1]]
}

# create output dataframe (all values are percentages)
output <- tibble(
  application = 'nerc_irf_2021',
  percent_refs_with_female = gender_prop(papers_with_female_male, 'female') * 100,
  percent_refs_with_male = gender_prop(papers_with_female_male, 'male') * 100,
  first_author_female = gender_prop(first_authors, 'female') * 100,
  last_author_female = gender_prop(last_authors, 'female') * 100,
  all_authors_female = gender_prop(all_authors, 'female') * 100
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment