Skip to content

Instantly share code, notes, and snippets.

@padpadpadpad
Last active September 17, 2021 16:04
Show Gist options
  • Save padpadpadpad/adb5734d16fcf55103e16a9f2a4ec490 to your computer and use it in GitHub Desktop.
Save padpadpadpad/adb5734d16fcf55103e16a9f2a4ec490 to your computer and use it in GitHub Desktop.
# look at gender balance of references
# load packages
library(tidyverse)
library(gender)
# Load the reference list and reshape it to one row per author.
# The CSV is read without a header: column 1 (V1) carries the reference
# number, column 2 (V2) the reference text beginning with the author list.
d <- read.csv('path_to_refs/reference_list.csv', header = FALSE) %>%
  # give the two columns meaningful names
  rename(reference = V1, authors = V2) %>%
  # reference: keep just the number.
  # authors: turn every digit into '!' and cut from the first '!' onwards, so
  # everything from the first digit (presumably the year) is discarded; then
  # turn ' and' into ',' so that all authors are comma separated.
  mutate(
    reference = parse_number(reference),
    authors = gsub('[0-9]', '!', authors),
    authors = gsub('!.*', '', authors),
    authors = gsub(' and', ',', authors)
  ) %>%
  # one row per author of each reference
  separate_rows(authors, sep = ',') %>%
  # strip surrounding white space BEFORE dropping empty rows: otherwise a
  # blank fragment such as the ' ' left by a trailing ', ' survives the filter
  # and is later counted as the last author of the reference
  mutate(authors = trimws(authors)) %>%
  filter(authors != '') %>%
  group_by(reference) %>%
  # position of each author within the reference and a loose role label;
  # a sole author is labelled 'first_author'
  mutate(
    order = row_number(),
    author_info = case_when(
      order == 1 ~ 'first_author',
      order == n() ~ 'last_author',
      TRUE ~ 'co_author'
    )
  ) %>%
  ungroup()
# First name = everything in the author string before its first space.
d <- d %>%
  mutate(first_name = gsub('\\ .*', '', authors))
# Keep only authors whose first name contains no full stop; this removes
# dotted initials such as 'J.' (an initial written without a dot is kept).
d2 <- d %>%
  filter(!str_detect(first_name, '\\.'))
# Predict gender from first name with the genderize method of gender().
# The lookup is done once per DISTINCT first name rather than once per author
# row: genderize is a web API, so repeating the request for every occurrence
# of a common name only adds latency and eats into the request allowance.
gender_lookup <- distinct(d2, first_name) %>%
  mutate(gender_predict = map(first_name, ~gender(.x, method = 'genderize'))) %>%
  unnest(gender_predict)
# attach the predictions to every author row; names without a prediction
# end up with NA in the gender column
d2 <- left_join(d2, gender_lookup, by = 'first_name')
# keep only the authors for which a gender was predicted
d3 <- filter(d2, !is.na(gender)) %>%
  ungroup()
# Proportion of last authors (with a predicted gender) falling in each gender:
# count the rows per gender, then divide by the total number of last authors.
last_authors <- d3 %>%
  filter(author_info == 'last_author') %>%
  group_by(gender) %>%
  summarise(n_in_group = n(), .groups = 'drop') %>%
  mutate(prop = n_in_group / sum(n_in_group)) %>%
  select(gender, prop)
# Proportion of first authors (with a predicted gender) falling in each gender:
# count the rows per gender, then divide by the total number of first authors.
first_authors <- d3 %>%
  filter(author_info == 'first_author') %>%
  group_by(gender) %>%
  summarise(n_in_group = n(), .groups = 'drop') %>%
  mutate(prop = n_in_group / sum(n_in_group)) %>%
  select(gender, prop)
# Proportion of all author rows (with a predicted gender) in each gender,
# regardless of author position.
all_authors <- d3 %>%
  group_by(gender) %>%
  summarise(n_in_group = n(), .groups = 'drop') %>%
  mutate(prop = n_in_group / sum(n_in_group)) %>%
  select(gender, prop)
# Proportion of references that have at least one author of each gender.
# Each reference/gender pair is counted once; the denominator is the number
# of distinct references left in d3.
papers_with_female_male <- d3 %>%
  distinct(reference, gender) %>%
  group_by(gender) %>%
  summarise(n_refs = n()) %>%
  mutate(prop = n_refs / length(unique(d3$reference))) %>%
  select(gender, prop)
# Percentage for one gender label taken from a gender/prop summary table.
# Looking the value up by label (instead of by row position) keeps the result
# correct when a gender is missing from the table: a missing label gives 0,
# and an empty table gives NA because there is nothing to take a share of.
gender_percent <- function(props, label) {
  if (nrow(props) == 0) {
    return(NA_real_)
  }
  sum(props$prop[props$gender == label]) * 100
}
# create output dataframe: one row of percentages for this application
output <- tibble(
  application = 'nerc_irf_2021',
  percent_refs_with_female = gender_percent(papers_with_female_male, 'female'),
  percent_refs_with_male = gender_percent(papers_with_female_male, 'male'),
  first_author_female = gender_percent(first_authors, 'female'),
  last_author_female = gender_percent(last_authors, 'female'),
  all_authors_female = gender_percent(all_authors, 'female')
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment