stephenturner · September 2, 2024 10:11
diff --git a/biorxiv-full-text-sentiment.r b/biorxiv-full-text-sentiment.r
 library(tidyverse)
 library(rvest)
 library(tidytext)

 ## Fetch the full text of a bioRxiv preprint as a single string.
 ##
 ## Requests https://www.biorxiv.org/content/<doi>.full, takes the text of
 ## every `div.section` node, drops sections whose text starts with
 ## "references" or "bibliography" (case-insensitive), and joins what is left
 ## with blank lines.
 ##
 ## doi: a single DOI, e.g. "10.1101/2023.07.14.549004".
 ## Returns a length-one character vector ("" when no section is kept).
 ## Stops early if `doi` is not a single non-missing value; any error raised
 ## by rvest::read_html() is passed on to the caller unchanged.
 get_full_text <- function(doi) {
   stopifnot(
     "`doi` must be a single, non-missing value" =
       length(doi) == 1L && !is.na(doi)
   )

   sections <-
     paste0("https://www.biorxiv.org/content/", doi, ".full") |>
     rvest::read_html() |>
     rvest::html_elements("div.section") |>
     rvest::html_text()

   ## One case-insensitive pass flags both kinds of reference section.
   is_reference <- grepl(
     "^(references|bibliography)",
     sections,
     ignore.case = TRUE
   )

   paste(sections[!is_reference], collapse = "\n\n")
 }

 ## Example: Get full text for a single DOI
 doi <- "10.1101/2023.07.14.549004"
 get_full_text(doi)

 ## Read in the data from https://zenodo.org/doi/10.5281/zenodo.13622640
 rdf <- readRDS("biorxiv-preprints-2014-2023.rds")

 ## Choose topics
 mycats <- c(
   "Bioinformatics",
   "Scientific Communication And Education",
   "Cancer Biology"
 )

 ## Number of papers to download per category
 n_per_category <- 100

 ## Wrap the downloader so that one failing page yields NA instead of
 ## aborting the whole batch of downloads
 get_full_text_or_na <- possibly(get_full_text, otherwise = NA_character_)

 ## Get full text from the last `n_per_category` rows of each category
 ## (the most recent papers, assuming `rdf` is ordered oldest to newest --
 ## TODO confirm against the Zenodo data)
 fulltext_subset <-
   rdf |>
   filter(category %in% mycats) |>
   slice_tail(n = n_per_category, by = category) |>
   mutate(fulltext = map_chr(.data$doi, get_full_text_or_na))

 ## Report failed downloads rather than dropping them silently
 n_failed <- sum(is.na(fulltext_subset$fulltext))
 if (n_failed > 0) {
   warning(
     n_failed, " full-text download(s) failed and will be skipped",
     call. = FALSE
   )
 }

 ## Get sentiment analysis words
 bingsentiment <- get_sentiments("bing")

 ## Conduct sentiment analysis: one row per category with counts of
 ## positive and negative words, ordered by the share that is positive
 sentiment_results <-
   fulltext_subset |>
   filter(!is.na(fulltext)) |>
   unnest_tokens(word, fulltext) |>
   anti_join(stop_words, by = "word") |>
   inner_join(bingsentiment, by = "word") |>
   count(category, sentiment) |>
   pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) |>
   mutate(
     total = positive + negative,
     sentiment = positive - negative,
     ## despite the name this is a fraction in [0, 1], not a percentage
     percent_positive = positive / total
   ) |>
   arrange(percent_positive)
	library(tidyverse)
	library(rvest)
	library(tidytext)

	## Fetch the full text of a bioRxiv preprint as a single string.
	##
	## Requests https://www.biorxiv.org/content/<doi>.full, takes the text of
	## every `div.section` node, drops sections whose text starts with
	## "references" or "bibliography" (case-insensitive), and joins what is left
	## with blank lines.
	##
	## doi: a single DOI, e.g. "10.1101/2023.07.14.549004".
	## Returns a length-one character vector ("" when no section is kept).
	## Stops early if `doi` is not a single non-missing value; any error raised
	## by rvest::read_html() is passed on to the caller unchanged.
	get_full_text <- function(doi) {
	  stopifnot(
	    "`doi` must be a single, non-missing value" =
	      length(doi) == 1L && !is.na(doi)
	  )

	  sections <-
	    paste0("https://www.biorxiv.org/content/", doi, ".full") |>
	    rvest::read_html() |>
	    rvest::html_elements("div.section") |>
	    rvest::html_text()

	  ## One case-insensitive pass flags both kinds of reference section.
	  is_reference <- grepl(
	    "^(references|bibliography)",
	    sections,
	    ignore.case = TRUE
	  )

	  paste(sections[!is_reference], collapse = "\n\n")
	}

	## Example: Get full text for a single DOI
	doi <- "10.1101/2023.07.14.549004"
	get_full_text(doi)

	## Read in the data from https://zenodo.org/doi/10.5281/zenodo.13622640
	rdf <- readRDS("biorxiv-preprints-2014-2023.rds")

	## Choose topics
	mycats <- c(
	  "Bioinformatics",
	  "Scientific Communication And Education",
	  "Cancer Biology"
	)

	## Number of papers to download per category
	n_per_category <- 100

	## Wrap the downloader so that one failing page yields NA instead of
	## aborting the whole batch of downloads
	get_full_text_or_na <- possibly(get_full_text, otherwise = NA_character_)

	## Get full text from the last `n_per_category` rows of each category
	## (the most recent papers, assuming `rdf` is ordered oldest to newest --
	## TODO confirm against the Zenodo data)
	fulltext_subset <-
	  rdf |>
	  filter(category %in% mycats) |>
	  slice_tail(n = n_per_category, by = category) |>
	  mutate(fulltext = map_chr(.data$doi, get_full_text_or_na))

	## Report failed downloads rather than dropping them silently
	n_failed <- sum(is.na(fulltext_subset$fulltext))
	if (n_failed > 0) {
	  warning(
	    n_failed, " full-text download(s) failed and will be skipped",
	    call. = FALSE
	  )
	}

	## Get sentiment analysis words
	bingsentiment <- get_sentiments("bing")

	## Conduct sentiment analysis: one row per category with counts of
	## positive and negative words, ordered by the share that is positive
	sentiment_results <-
	  fulltext_subset |>
	  filter(!is.na(fulltext)) |>
	  unnest_tokens(word, fulltext) |>
	  anti_join(stop_words, by = "word") |>
	  inner_join(bingsentiment, by = "word") |>
	  count(category, sentiment) |>
	  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) |>
	  mutate(
	    total = positive + negative,
	    sentiment = positive - negative,
	    ## despite the name this is a fraction in [0, 1], not a percentage
	    percent_positive = positive / total
	  ) |>
	  arrange(percent_positive)