Last active
August 5, 2017 02:15
-
-
Save thoughtfulbloke/5aaa4536c3950d1f3307b12554d53a10 to your computer and use it in GitHub Desktop.
Showing how to analyse a bunch of abstracts using tidypvals, fulltext, and tidytext in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Packages used by this script. `allp` below is presumably the combined
# p-value table shipped with tidypvals -- confirm against that package.
library(tidypvals)
library(dplyr)
library(fulltext)
library(tidytext)
library(tidyr)
# ggplot(), aes(), geom_density() and ggtitle(subtitle = ) live in ggplot2;
# there is no current package called "ggplot".
library(ggplot2)
library(parallel)

# Keep only p-values that have a DOI and were reported with the "equals"
# operator.
hasDOI <- allp %>%
  filter(!is.na(doi), operator == "equals")

# Restrict to DOIs containing the literal text "pone"
# (presumably PLOS ONE articles).
plosDOI <- hasDOI[grepl("pone", hasDOI$doi, fixed = TRUE), ]
# In theory error handling isn't needed if only targeting PLOS articles.
#
# Fetch the abstract for a single DOI via ft_abstract().
#
# x: a DOI, passed straight to ft_abstract().
#
# Returns a length-1 character vector: the abstract text, or NA_character_
# when the lookup signals an error, a warning or a message, or when the
# returned record has no "abstract" entry. Always returning a single string
# keeps the result type stable when the function is mapped over many DOIs.
fltx_seek_abstract <- function(x) {
  abstract <- tryCatch(
    ft_abstract(x)[["plos"]][[1]][["abstract"]],
    # Best effort: any condition raised during the lookup counts as
    # "no abstract available".
    error = function(cond) NA_character_,
    warning = function(cond) NA_character_,
    message = function(cond) NA_character_
  )
  # A record without an abstract field yields NULL (length 0).
  if (length(abstract) == 0L) {
    return(NA_character_)
  }
  as.character(abstract)[[1L]]
}
# Rather than using my download-individuals code, I strongly urge
# people to have a look at Scott Chamberlain's (@sckottie) GIST
# https://gist.github.com/sckott/ee32e58b0b2fe2f722a5b5112234c893

# Look up one abstract per DOI. vapply() guarantees a character column even
# if every lookup fails; each result is coerced to a single string, with
# NA for a missing or empty result.
plosDOI$Absfltx <- vapply(
  plosDOI$doi,
  function(doi) as.character(fltx_seek_abstract(doi))[1L],
  character(1)
)

# I have just spent 36 hours with the computer downloading 348542 abstracts.
# Let's save what I have done.
save(plosDOI, file = "PLOSDOI.RData")
rm(hasDOI)
# Vocabulary used to classify an abstract.
cat_words <- c("cats", "cat", "feline", "felines")
dog_words <- c("dog", "dogs", "canine", "canines")
# "canine" can refer to teeth, so abstracts with dental vocabulary are
# excluded further down.
teeth_words <- c("tooth", "teeth", "dentistry", "dental", "oral")

# One row per (doi, pvalue, animal): tokenise each abstract into words, flag
# whether any word belongs to each vocabulary, drop abstracts that mention
# teeth, and keep those mentioning cats and/or dogs. An abstract that
# mentions both animals contributes one row to each.
cats_dogs_living_together <- plosDOI %>%
  select(doi, pvalue, Absfltx) %>%
  unnest_tokens(word, Absfltx) %>%
  mutate(
    cats = word %in% cat_words,
    dogs = word %in% dog_words,
    teeth = word %in% teeth_words
  ) %>%
  group_by(doi, pvalue) %>%
  summarise(
    is_cat = any(cats),
    is_dog = any(dogs),
    is_teeth = any(teeth)
  ) %>%
  ungroup() %>%
  filter(!is_teeth & (is_cat | is_dog)) %>%
  select(-is_teeth) %>%
  # Long format: `animal` takes the values "is_cat" / "is_dog".
  gather(animal, relevant, is_cat:is_dog) %>%
  filter(relevant) %>%
  select(-relevant)
# Density of p-values for cat vs dog abstracts. The count in the subtitle is
# taken from the data (2667 rows in the original run) rather than typed in.
ggplot(cats_dogs_living_together, aes(x = pvalue, colour = animal)) +
  geom_density() +
  ggtitle(
    "Cats vs Dogs pvalues",
    subtitle = paste(
      "source: Abstracts of",
      nrow(cats_dogs_living_together),
      "PLOS articles"
    )
  )
# Mean p-value and number of rows per animal, computed once and reused.
animal_summary <- cats_dogs_living_together %>%
  group_by(animal) %>%
  summarise(mpv = mean(pvalue), number = n())
animal_summary
# 757 cat studies, 1910 dog studies suggests dogs are just easier to do
# experiments on.

# Difference between the first and second group means (rows are ordered by
# `animal`, so this is is_cat minus is_dog).
animal_summary %>%
  summarise(diff_in_mean = mpv[1] - mpv[2])
# difference in mean pvals 0.02344332
# Significance test - irregular distribution, so simulation is used.
# If we assume that cats and dogs are being drawn from a common distribution
# of "research on household pets" that is represented by the combined
# distribution, how likely is the observed difference in means (or a more
# extreme one) to occur by chance?
num_sim <- 10000000

# Draw one simulated absolute difference in group means.
#
# x:       ignored; it only exists so the function can be mapped over a
#          sequence of simulation indices.
# pvals:   numeric vector of p-values to resample from (with replacement).
# n_first: size of the first simulated group (default 757).
# n_total: total number of values drawn (default 2667); the second group
#          is the remaining n_total - n_first values.
#
# Returns a single non-negative number: |mean(group 1) - mean(group 2)|.
a_diff_of_means <- function(x, pvals, n_first = 757, n_total = 2667) {
  stopifnot(n_first >= 1, n_first < n_total)
  catdog <- sample(pvals, n_total, replace = TRUE)
  first_group <- seq_len(n_first)
  abs(mean(catdog[first_group]) - mean(catdog[-first_group]))
}
pvalsVec <- cats_dogs_living_together$pvalue

# Observed absolute difference between the cat and dog mean p-values
# (0.02344332 in the original run), derived from the data so it cannot drift
# out of sync with a hand-copied constant.
group_means <- tapply(
  cats_dogs_living_together$pvalue,
  cats_dogs_living_together$animal,
  mean
)
observed_diff <- abs(group_means[["is_cat"]] - group_means[["is_dog"]])

# This is going parallel for doing a lot of simulations. YMMV,
# but at 10000000 my computer's fan revs up and the room warms.
# Leave one core free, but always use at least one worker: detectCores() can
# return 1 or NA.
no_cores <- max(1L, detectCores() - 1L, na.rm = TRUE)

# Initiate cluster
cl <- makeCluster(no_cores)
# NOTE(review): no RNG stream is set on the workers, so results differ from
# run to run; consider clusterSetRNGStream() if reproducibility matters.
sim <- tryCatch(
  {
    clusterExport(cl, varlist = c("num_sim", "a_diff_of_means", "pvalsVec"))
    parSapply(cl, seq_len(num_sim), a_diff_of_means, pvals = pvalsVec)
  },
  # Always release the worker processes, even if the simulation fails.
  finally = stopCluster(cl)
)

# Proportion of simulated differences at least as large as the observed one.
mean(sim >= observed_diff)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment