padpadpadpad · September 17, 2021 16:04
diff --git a/check_gender_balance.R b/check_gender_balance.R
 # look at gender balance of references

 # load packages
 library(tidyverse)
 library(gender)

 # load in dataset and reshape it to one row per author
 # the csv is read without a header: column 1 gives the reference number and
 # column 2 the reference text, of which only the part before the first digit
 # is kept as the author list
 d <- read.csv('path_to_refs/reference_list.csv', header = FALSE) %>%
   # rename columns
   rename(reference = V1, authors = V2) %>%
   # grab reference number, change every digit to !, cut everything from the
   # first ! onwards, and turn the word 'and' into a ',' separator
   # (\\b keeps ' and' from matching a word that merely starts with 'and')
   mutate(reference = parse_number(reference),
          authors = gsub('[0-9]', '!', authors),
          authors = gsub('!.*', '', authors),
          authors = gsub(' and\\b', ',', authors)) %>%
   # stack all authors from the same reference
   separate_rows(authors, sep = ',') %>%
   # remove white space from names BEFORE dropping empty rows: a fragment that
   # is only white space (e.g. after a trailing ', ' in front of the year)
   # would otherwise be kept, numbered, and could take the last author slot
   mutate(authors = trimws(authors)) %>%
   filter(authors != '') %>%
   group_by(reference) %>%
   # add loose naming strategies for first and last author; the first_author
   # condition is checked first, so a single-author reference is first_author
   mutate(order = row_number(),
          author_info = case_when(order == 1 ~ 'first_author',
                                  order == max(order) ~ 'last_author',
                                  TRUE ~ 'co_author')) %>%
   ungroup()

 # grab first name: everything before the first space of the author string
 d <- mutate(d, first_name = gsub('\\ .*', '', authors))

 # drop samples with only initial as first name: anything containing a period
 # (e.g. 'J.') and also bare single letters written without one (e.g. 'J');
 # the length check removes empty first names as well
 d2 <- filter(d, !str_detect(first_name, '\\.'), nchar(first_name) > 1)
  
 # predict gender based on first name
 # rows are nested by reference / order / author_info so each author's first
 # name is looked up on its own and the result sits next to that author
 d2 <- d2 %>%
   group_by(reference, order, author_info) %>%
   nest() %>%
   mutate(gender_predict = map(data, function(author) {
     gender(author$first_name, method = 'genderize')
   })) %>%
   # get predictions: expand the nested data and the lookup result side by side
   unnest(c(data, gender_predict))

 # remove NAs where the prediction has not worked
 d3 <- d2 %>%
   filter(!is.na(gender)) %>%
   ungroup()

 # % last authors that are women
 # one row per predicted gender; prop is that gender's share of last authors
 last_authors <- d3 %>%
   filter(author_info == 'last_author') %>%
   count(gender) %>%
   mutate(prop = n / sum(n)) %>%
   select(gender, prop)

 # % first authors that are women
 # one row per predicted gender; prop is that gender's share of first authors
 first_authors <- d3 %>%
   filter(author_info == 'first_author') %>%
   count(gender) %>%
   mutate(prop = n / sum(n)) %>%
   select(gender, prop)

 # % all authors that are women
 # one row per predicted gender; prop is that gender's share of all authors
 all_authors <- d3 %>%
   count(gender) %>%
   mutate(prop = n / sum(n)) %>%
   select(gender, prop)

 # % references with a female/male author somewhere
 # each reference counts once per gender it contains; the denominator is the
 # number of distinct references left in d3
 papers_with_female_male <- d3 %>%
   distinct(reference, gender) %>%
   count(gender) %>%
   mutate(prop = n / n_distinct(d3$reference)) %>%
   select(gender, prop)

 # create output dataframe
 # percentages are looked up by gender label, not by row position: the
 # summaries are sorted by gender, so row 1 is only the female row when a
 # female row exists (otherwise the male share would be reported as female)

 # percentage for one gender label in a gender/prop summary table;
 # 0 when the label is absent, NA when the summary has no rows at all
 gender_percent <- function(props, label) {
   if (nrow(props) == 0) {
     return(NA_real_)
   }
   sum(props$prop[props$gender %in% label]) * 100
 }

 output <- tibble(application = 'nerc_irf_2021',
                  percent_refs_with_female = gender_percent(papers_with_female_male, 'female'),
                  percent_refs_with_male = gender_percent(papers_with_female_male, 'male'),
                  first_author_female = gender_percent(first_authors, 'female'),
                  last_author_female = gender_percent(last_authors, 'female'),
                  all_authors_female = gender_percent(all_authors, 'female'))
	# look at gender balance of references

	# load packages
	library(tidyverse)
	library(gender)

	# load in dataset and reshape it to one row per author
	# the csv is read without a header: column 1 gives the reference number and
	# column 2 the reference text, of which only the part before the first digit
	# is kept as the author list
	d <- read.csv('path_to_refs/reference_list.csv', header = FALSE) %>%
	  # rename columns
	  rename(reference = V1, authors = V2) %>%
	  # grab reference number, change every digit to !, cut everything from the
	  # first ! onwards, and turn the word 'and' into a ',' separator
	  # (\\b keeps ' and' from matching a word that merely starts with 'and')
	  mutate(reference = parse_number(reference),
	         authors = gsub('[0-9]', '!', authors),
	         authors = gsub('!.*', '', authors),
	         authors = gsub(' and\\b', ',', authors)) %>%
	  # stack all authors from the same reference
	  separate_rows(authors, sep = ',') %>%
	  # remove white space from names BEFORE dropping empty rows: a fragment that
	  # is only white space (e.g. after a trailing ', ' in front of the year)
	  # would otherwise be kept, numbered, and could take the last author slot
	  mutate(authors = trimws(authors)) %>%
	  filter(authors != '') %>%
	  group_by(reference) %>%
	  # add loose naming strategies for first and last author; the first_author
	  # condition is checked first, so a single-author reference is first_author
	  mutate(order = row_number(),
	         author_info = case_when(order == 1 ~ 'first_author',
	                                 order == max(order) ~ 'last_author',
	                                 TRUE ~ 'co_author')) %>%
	  ungroup()

	# grab first name: everything before the first space of the author string
	d <- mutate(d, first_name = gsub('\\ .*', '', authors))

	# drop samples with only initial as first name: anything containing a period
	# (e.g. 'J.') and also bare single letters written without one (e.g. 'J');
	# the length check removes empty first names as well
	d2 <- filter(d, !str_detect(first_name, '\\.'), nchar(first_name) > 1)

	# predict gender based on first name
	# rows are nested by reference / order / author_info so each author's first
	# name is looked up on its own and the result sits next to that author
	d2 <- d2 %>%
	  group_by(reference, order, author_info) %>%
	  nest() %>%
	  mutate(gender_predict = map(data, function(author) {
	    gender(author$first_name, method = 'genderize')
	  })) %>%
	  # get predictions: expand the nested data and the lookup result side by side
	  unnest(c(data, gender_predict))

	# remove NAs where the prediction has not worked
	d3 <- d2 %>%
	  filter(!is.na(gender)) %>%
	  ungroup()

	# % last authors that are women
	# one row per predicted gender; prop is that gender's share of last authors
	last_authors <- d3 %>%
	  filter(author_info == 'last_author') %>%
	  count(gender) %>%
	  mutate(prop = n / sum(n)) %>%
	  select(gender, prop)

	# % first authors that are women
	# one row per predicted gender; prop is that gender's share of first authors
	first_authors <- d3 %>%
	  filter(author_info == 'first_author') %>%
	  count(gender) %>%
	  mutate(prop = n / sum(n)) %>%
	  select(gender, prop)

	# % all authors that are women
	# one row per predicted gender; prop is that gender's share of all authors
	all_authors <- d3 %>%
	  count(gender) %>%
	  mutate(prop = n / sum(n)) %>%
	  select(gender, prop)

	# % references with a female/male author somewhere
	# each reference counts once per gender it contains; the denominator is the
	# number of distinct references left in d3
	papers_with_female_male <- d3 %>%
	  distinct(reference, gender) %>%
	  count(gender) %>%
	  mutate(prop = n / n_distinct(d3$reference)) %>%
	  select(gender, prop)

	# create output dataframe
	# percentages are looked up by gender label, not by row position: the
	# summaries are sorted by gender, so row 1 is only the female row when a
	# female row exists (otherwise the male share would be reported as female)

	# percentage for one gender label in a gender/prop summary table;
	# 0 when the label is absent, NA when the summary has no rows at all
	gender_percent <- function(props, label) {
	  if (nrow(props) == 0) {
	    return(NA_real_)
	  }
	  sum(props$prop[props$gender %in% label]) * 100
	}

	output <- tibble(application = 'nerc_irf_2021',
	                 percent_refs_with_female = gender_percent(papers_with_female_male, 'female'),
	                 percent_refs_with_male = gender_percent(papers_with_female_male, 'male'),
	                 first_author_female = gender_percent(first_authors, 'female'),
	                 last_author_female = gender_percent(last_authors, 'female'),
	                 all_authors_female = gender_percent(all_authors, 'female'))