thoughtfulbloke · August 5, 2017 02:15
diff --git a/catsAndDogs.R b/catsAndDogs.R
 library(tidypvals)
 library(dplyr)
 library(fulltext)
 library(tidytext)
 library(tidyr)
 # The plotting package is named "ggplot2"; library(ggplot) stops the script
 # with "there is no package called 'ggplot'".
 library(ggplot2)
 library(parallel)

 # Keep the rows that have a DOI and whose `operator` column is "equals".
 # `allp` is not defined in this script -- presumably the dataset exported by
 # tidypvals (TODO confirm).
 hasDOI <- allp %>% filter(!is.na(doi), operator == "equals")
 # Restrict to rows whose DOI contains the literal text "pone".
 plosDOI <- hasDOI[grepl("pone", hasDOI$doi, fixed = TRUE), ]

 # Look up the abstract for a single DOI via fulltext::ft_abstract().
 #
 # x: a DOI, passed straight through to ft_abstract().
 # Returns the "abstract" field of the first "plos" entry as character, or
 # NA_character_ when the lookup fails or no abstract is present.
 #
 # In theory error handling isn't needed if only targeting PLOS articles, but
 # any error, warning or message raised during the lookup abandons it and
 # yields NA. The missing value is always NA_character_ (never logical NA or
 # NULL) so callers can rely on getting character data back.
 fltx_seek_abstract <- function(x) {
   give_up <- function(cond) NA_character_
   abstract <- tryCatch(
     ft_abstract(x)[["plos"]][[1]][["abstract"]],
     error = give_up,
     warning = give_up,
     message = give_up
   )
   # A missing "abstract" field comes back as NULL; report it as missing
   # instead of leaking NULL to the caller.
   if (length(abstract) == 0L) {
     return(NA_character_)
   }
   as.character(abstract)
 }

 # rather than using my download individuals code, I strongly urge
 # people to have a look at Scott Chamberlain's (@sckottie) GIST
 # https://gist.github.com/sckott/ee32e58b0b2fe2f722a5b5112234c893
 #
 # Fetch one abstract per DOI. vapply() pins the column to character: sapply()
 # would silently produce a list (if any lookup returned NULL or more than one
 # element) or a logical vector (if every lookup failed). Each result is
 # coerced to character and reduced to its first element, so a NULL or NA
 # result becomes NA_character_.
 plosDOI$Absfltx <- vapply(
   plosDOI$doi,
   function(doi) as.character(fltx_seek_abstract(doi))[1L],
   character(1)
 )


 # I have just spent 36 hours with the computer downloading 348542 abstracts,
 # Let's save what I have done
 save(plosDOI, file = "PLOSDOI.RData")
 # hasDOI is not used again below, so drop it from the workspace.
 rm(hasDOI)

 # Build one row per (doi, pvalue, animal) for abstracts that mention cats
 # and/or dogs but no dental vocabulary.
 #
 # Steps: split each abstract into one word per row, flag cat / dog / dental
 # words, collapse the flags per (doi, pvalue), drop dental articles, then
 # stack the is_cat / is_dog flags into a single `animal` column and keep only
 # the flags that are TRUE. An abstract mentioning both animals therefore
 # contributes a row to each.
 #
 # %in% and any() are used instead of chained `==` and sum() > 0 so that a
 # missing token counts as "no match" rather than turning a whole group's
 # flag into NA.
 cats_dogs_living_together <- plosDOI %>%
   select(doi, pvalue, Absfltx) %>%
   unnest_tokens(word, Absfltx) %>%
   mutate(
     cats = word %in% c("cats", "cat", "feline", "felines"),
     dogs = word %in% c("dog", "dogs", "canine", "canines"),
     # "canine" can refer to teeth, so dental vocabulary is flagged too
     teeth = word %in% c("tooth", "teeth", "dentistry", "dental", "oral")
   ) %>%
   group_by(doi, pvalue) %>%
   summarise(
     is_cat = any(cats),
     is_dog = any(dogs),
     is_teeth = any(teeth)
   ) %>%
   ungroup() %>%
   filter(!is_teeth & (is_cat | is_dog)) %>%
   select(-is_teeth) %>%
   gather(animal, relevence, is_cat:is_dog) %>%
   filter(relevence) %>%
   select(-relevence)

 # Density of p-values, one curve per animal. The count in the subtitle is
 # taken from the data (one row per doi / pvalue / animal) instead of being
 # hard-coded, so it cannot go stale when the input changes.
 ggplot(cats_dogs_living_together, aes(x = pvalue, colour = animal)) +
   geom_density() +
   ggtitle(
     "Cats vs Dogs pvalues",
     subtitle = paste(
       "source: Abstracts of",
       nrow(cats_dogs_living_together),
       "PLOS articles"
     )
   )

 # Mean p-value and number of rows for each animal.
 cats_dogs_living_together %>%
   group_by(animal) %>%
   summarise(
     mpv = mean(pvalue),
     number = n()
   )
 # 757 cat studies, 1910 dog studies suggests dogs are just easier to do experiments on

 # The same per-animal table, reduced to a single number:
 # mean p-value of the first group minus mean p-value of the second group.
 cats_dogs_living_together %>%
   group_by(animal) %>%
   summarise(
     mpv = mean(pvalue),
     number = n()
   ) %>%
   summarise(diff_in_mean = mpv[1] - mpv[2])
 # difference in mean pvals 0.02344332

 # significance test - irregular distribution, so will use simulation.
 # if we assume that cats and dogs are being drawn from a common distribution
 # of "research on household pets" that is represented by the combined distribution,
 # how likely is the observed difference in means, or a larger one, to occur by chance?

 # Number of simulated resamples.
 num_sim <- 10000000

 # One simulation draw: resample the pooled p-values with replacement, split
 # the resample into two groups, and return the absolute difference between
 # the group means.
 #
 # x:       ignored; present so the function can be mapped over an index
 #          vector (e.g. by sapply / parSapply).
 # pvals:   pooled vector of p-values to resample from.
 # n_first: size of the first group (default 757).
 # n_total: total resample size (default 2667); the second group is the
 #          remaining n_total - n_first draws.
 # Returns a single non-negative number.
 a_diff_of_means <- function(x, pvals, n_first = 757L, n_total = 2667L) {
   stopifnot(n_first >= 1L, n_first < n_total)
   resampled <- sample(pvals, n_total, replace = TRUE)
   first_group <- seq_len(n_first)
   abs(mean(resampled[first_group]) - mean(resampled[-first_group]))
 }
 # Pooled p-values (cats and dogs together) to resample from.
 pvalsVec <- cats_dogs_living_together$pvalue
 # Observed difference in mean p-values, as recorded earlier in this script.
 observed_diff <- 0.02344332

 # this is going parallel for doing a lot of simulations YMMV,
 # but at 10000000 my computer's fan revs up and the room warms

 # Leave one core free, but never request fewer than one worker:
 # detectCores() can return 1 or NA.
 no_cores <- max(1L, detectCores() - 1L, na.rm = TRUE)
 # Initiate cluster
 cl <- makeCluster(no_cores)
 # Always shut the workers down, even if the simulation fails part-way.
 # NOTE(review): the worker RNG streams are not seeded, so the estimate varies
 # from run to run.
 sim <- tryCatch(
   {
     clusterExport(cl, varlist = c("num_sim", "a_diff_of_means", "pvalsVec"))
     parSapply(cl, seq_len(num_sim), a_diff_of_means, pvals = pvalsVec)
   },
   finally = stopCluster(cl)
 )

 # Share of simulated differences at least as large as the observed one.
 mean(sim >= observed_diff)
	library(tidypvals)
	library(dplyr)
	library(fulltext)
	library(tidytext)
	library(tidyr)
	# The plotting package is named "ggplot2"; library(ggplot) stops the script
	# with "there is no package called 'ggplot'".
	library(ggplot2)
	library(parallel)

	# Keep the rows that have a DOI and whose `operator` column is "equals".
	# `allp` is not defined in this script -- presumably the dataset exported by
	# tidypvals (TODO confirm).
	hasDOI <- allp %>% filter(!is.na(doi), operator == "equals")
	# Restrict to rows whose DOI contains the literal text "pone".
	plosDOI <- hasDOI[grepl("pone", hasDOI$doi, fixed = TRUE), ]

	# Look up the abstract for a single DOI via fulltext::ft_abstract().
	#
	# x: a DOI, passed straight through to ft_abstract().
	# Returns the "abstract" field of the first "plos" entry as character, or
	# NA_character_ when the lookup fails or no abstract is present.
	#
	# In theory error handling isn't needed if only targeting PLOS articles, but
	# any error, warning or message raised during the lookup abandons it and
	# yields NA. The missing value is always NA_character_ (never logical NA or
	# NULL) so callers can rely on getting character data back.
	fltx_seek_abstract <- function(x) {
	  give_up <- function(cond) NA_character_
	  abstract <- tryCatch(
	    ft_abstract(x)[["plos"]][[1]][["abstract"]],
	    error = give_up,
	    warning = give_up,
	    message = give_up
	  )
	  # A missing "abstract" field comes back as NULL; report it as missing
	  # instead of leaking NULL to the caller.
	  if (length(abstract) == 0L) {
	    return(NA_character_)
	  }
	  as.character(abstract)
	}

	# rather than using my download individuals code, I strongly urge
	# people to have a look at Scott Chamberlain's (@sckottie) GIST
	# https://gist.github.com/sckott/ee32e58b0b2fe2f722a5b5112234c893
	#
	# Fetch one abstract per DOI. vapply() pins the column to character: sapply()
	# would silently produce a list (if any lookup returned NULL or more than one
	# element) or a logical vector (if every lookup failed). Each result is
	# coerced to character and reduced to its first element, so a NULL or NA
	# result becomes NA_character_.
	plosDOI$Absfltx <- vapply(
	  plosDOI$doi,
	  function(doi) as.character(fltx_seek_abstract(doi))[1L],
	  character(1)
	)


	# I have just spent 36 hours with the computer downloading 348542 abstracts,
	# Let's save what I have done
	save(plosDOI, file = "PLOSDOI.RData")
	# hasDOI is not used again below, so drop it from the workspace.
	rm(hasDOI)

	# Build one row per (doi, pvalue, animal) for abstracts that mention cats
	# and/or dogs but no dental vocabulary.
	#
	# Steps: split each abstract into one word per row, flag cat / dog / dental
	# words, collapse the flags per (doi, pvalue), drop dental articles, then
	# stack the is_cat / is_dog flags into a single `animal` column and keep only
	# the flags that are TRUE. An abstract mentioning both animals therefore
	# contributes a row to each.
	#
	# The backslash-escaped pipes (a markdown artefact that is not valid R) are
	# gone: the word tests use %in% and any(), which also means a missing token
	# counts as "no match" rather than turning a whole group's flag into NA.
	cats_dogs_living_together <- plosDOI %>%
	  select(doi, pvalue, Absfltx) %>%
	  unnest_tokens(word, Absfltx) %>%
	  mutate(
	    cats = word %in% c("cats", "cat", "feline", "felines"),
	    dogs = word %in% c("dog", "dogs", "canine", "canines"),
	    # "canine" can refer to teeth, so dental vocabulary is flagged too
	    teeth = word %in% c("tooth", "teeth", "dentistry", "dental", "oral")
	  ) %>%
	  group_by(doi, pvalue) %>%
	  summarise(
	    is_cat = any(cats),
	    is_dog = any(dogs),
	    is_teeth = any(teeth)
	  ) %>%
	  ungroup() %>%
	  filter(!is_teeth & (is_cat | is_dog)) %>%
	  select(-is_teeth) %>%
	  gather(animal, relevence, is_cat:is_dog) %>%
	  filter(relevence) %>%
	  select(-relevence)

	# Density of p-values, one curve per animal. The count in the subtitle is
	# taken from the data (one row per doi / pvalue / animal) instead of being
	# hard-coded, so it cannot go stale when the input changes.
	ggplot(cats_dogs_living_together, aes(x = pvalue, colour = animal)) +
	  geom_density() +
	  ggtitle(
	    "Cats vs Dogs pvalues",
	    subtitle = paste(
	      "source: Abstracts of",
	      nrow(cats_dogs_living_together),
	      "PLOS articles"
	    )
	  )

	# Mean p-value and number of rows for each animal.
	cats_dogs_living_together %>%
	  group_by(animal) %>%
	  summarise(
	    mpv = mean(pvalue),
	    number = n()
	  )
	# 757 cat studies, 1910 dog studies suggests dogs are just easier to do experiments on

	# The same per-animal table, reduced to a single number:
	# mean p-value of the first group minus mean p-value of the second group.
	cats_dogs_living_together %>%
	  group_by(animal) %>%
	  summarise(
	    mpv = mean(pvalue),
	    number = n()
	  ) %>%
	  summarise(diff_in_mean = mpv[1] - mpv[2])
	# difference in mean pvals 0.02344332

	# significance test - irregular distribution, so will use simulation.
	# if we assume that cats and dogs are being drawn from a common distribution
	# of "research on household pets" that is represented by the combined distribution,
	# how likely is the observed difference in means, or a larger one, to occur by chance?

	# Number of simulated resamples.
	num_sim <- 10000000

	# One simulation draw: resample the pooled p-values with replacement, split
	# the resample into two groups, and return the absolute difference between
	# the group means.
	#
	# x:       ignored; present so the function can be mapped over an index
	#          vector (e.g. by sapply / parSapply).
	# pvals:   pooled vector of p-values to resample from.
	# n_first: size of the first group (default 757).
	# n_total: total resample size (default 2667); the second group is the
	#          remaining n_total - n_first draws.
	# Returns a single non-negative number.
	a_diff_of_means <- function(x, pvals, n_first = 757L, n_total = 2667L) {
	  stopifnot(n_first >= 1L, n_first < n_total)
	  resampled <- sample(pvals, n_total, replace = TRUE)
	  first_group <- seq_len(n_first)
	  abs(mean(resampled[first_group]) - mean(resampled[-first_group]))
	}
	# Pooled p-values (cats and dogs together) to resample from.
	pvalsVec <- cats_dogs_living_together$pvalue
	# Observed difference in mean p-values, as recorded earlier in this script.
	observed_diff <- 0.02344332

	# this is going parallel for doing a lot of simulations YMMV,
	# but at 10000000 my computer's fan revs up and the room warms

	# Leave one core free, but never request fewer than one worker:
	# detectCores() can return 1 or NA.
	no_cores <- max(1L, detectCores() - 1L, na.rm = TRUE)
	# Initiate cluster
	cl <- makeCluster(no_cores)
	# Always shut the workers down, even if the simulation fails part-way.
	# NOTE(review): the worker RNG streams are not seeded, so the estimate varies
	# from run to run.
	sim <- tryCatch(
	  {
	    clusterExport(cl, varlist = c("num_sim", "a_diff_of_means", "pvalsVec"))
	    parSapply(cl, seq_len(num_sim), a_diff_of_means, pvals = pvalsVec)
	  },
	  finally = stopCluster(cl)
	)

	# Share of simulated differences at least as large as the observed one.
	mean(sim >= observed_diff)