Skip to content

Instantly share code, notes, and snippets.

@hpiwowar
Created May 30, 2011 15:15
Show Gist options
  • Save hpiwowar/999040 to your computer and use it in GitHub Desktop.
Save hpiwowar/999040 to your computer and use it in GitHub Desktop.
Stats for the analysis and graphics in Piwowar ASIS&T 2011 submission "Finding the hold-outs"
### Heather Piwowar
### MIT open license.
### blog post about interm results:
library(rms)
# if true, draw figures on screen instead of in a file
SCREEN=TRUE
# set to ".eps" or ".png"
FORMAT=".pdf"
# PLoS standards from http://www.plosone.org/static/figureGuidelines.action#quickref
MAX.FIGURE.WIDTH = 6.83
MAX.FIGURE.HEIGHT = 8
setup.figure = function
### Set up a graphics object for plotting a journal-ready .eps figure
### call this, then do the plot, then call dev.off()
(
filename, ##<< filename for resulting eps file. Will get .eps or .png or .pdf appended
width=6.83, ##<< figure width, in inches
height=8, ##<< figure height, in inches
format=".eps", ##<< ".eps" or ".png" or ".pdf"
screen=FALSE
) {
if (screen) {
quartz()
} else {
filename.with.extension = paste(filename, format, sep="")
if (format==".eps") {
postscript(file=filename.with.extension,
paper="special",
width=width,
height=height,
family="sans",
onefile=FALSE,
pagecentre=FALSE,
horizontal=FALSE)
} else if (format==".png") {
png(filename.with.extension)
} else if (format==".pdf") {
pdf(filename.with.extension)
}
}
return()
##examples<<
setup.figure("figure1", 6, 8)
plot(c(1, 2, 3), c(2, 4, 6))
close.figure("figure1")
}
close.figure = function
### Closes a graphics object and embeds fonts in he associated .eps figure
### To be used with setup.figure
(
filename, ##<< filename of eps file. Will get .eps appended
format=".eps", ##<< ".eps" or ".png"
screen=FALSE
) {
if (!screen) {
dev.off()
if (format==".eps") {
filename.with.extension = paste(filename, ".eps", sep="")
embedFonts(filename.with.extension, outfile=filename.with.extension)
}
}
return()
##examples<<
setup.figure("figure1", 6, 8)
plot(c(1, 2, 3), c(2, 4, 6))
close.figure("figure1")
}
read.raw.data = function
### Read the raw text data file
### change underscores in column names to periods
### return as a data frame
(
filename="rawdata.txt" ##<< name of raw data
)
{
dat.raw = read.csv(filename, header=TRUE, sep="\t", stringsAsFactors=FALSE)
names(dat.raw) = gsub("_", ".", names(dat.raw))
#names.pretty = cat(names(dat.raw), sep="\n")
#print names.pretty
#save(dat.raw, file="dat_raw.Rdata")
#load("dat_raw.Rdata")
#dim(dat.raw)
#summary(dat.raw)
#names(dat.raw)
#names(dat.raw)[0:20]
return(dat.raw)
}
# Don't do transformations for this analysis. Use ranks, etc.
tr = function(x) return(x)
log.tr = function(x) return(x)
#### Some of my variables are extracted by looking to see if MEDLINE records have a MeSH term.
# This is fine, but sometimes MEDLINE records are incomplete, they are in process or aren't on the list
# of journals indexed by MeSH indexers. This means a lack of relevant MeSH term does not mean the MeSH
# term does not apply. Thus, must replace these 0s with NAs to indicate the data is indeed missing
medline.values = function(raw_values, medline_status) {
values = raw_values
values[medline_status!="indexed for MEDLINE"] = NA
return(values)
}
preprocess.raw.data = function
### Do preprocessing on raw data
### change variables to ordered factors, integers, etc as appropriate
### return cleaned data as a data frame
(
dat.raw ##<< data frame containing raw data
)
{
#### CONVERT STRINGS TO NUMBERS
dat.nums = get.dat.nums(dat.raw) ##<<details this produces warnings
#library(psych)
#described.dat.nums = psych::describe(dat.nums)
#write.table(described.dat.nums,"described_dat_nums.txt",append=F,quote=F,sep="\t",row.names=T)
#### START TO BUILD STATS DATASET
dat = data.frame(pmid = as.numeric(dat.nums$pmid))
# skip issn
# skip essn
# My naming convention is that "ago" means number of years between 2010 and the original variable
dat$years.ago = tr(2010 - dat.nums$pubmed.year.published)
# skip date_in_pubmed
# skip authors
# Break journal into top contenders
#dat$journal.bmc.genomics = ifelse("BMC Genomics" == dat.raw$pubmed.journal, 1, 0)
#dat$journal.cancer.res = ifelse("Cancer Res" == dat.raw$pubmed.journal, 1, 0)
#dat$journal.pnas = ifelse("Proc Natl Acad Sci U S A" == dat.raw$pubmed.journal, 1, 0)
#dat$journal.j.biol.chem = ifelse("J Biol Chem" == dat.raw$pubmed.journal, 1, 0)
#dat$journal.physiol.genomics = ifelse("Physiol Genomics" == dat.raw$pubmed.journal, 1, 0)
#dat$journal.plos.one = ifelse("PLoS One" == dat.raw$pubmed.journal, 1, 0)
dat$num.authors = tr(dat.nums$pubmed.number.authors)
####### FIRST AUTHOR VARIABLES
# skip first_author_first_name
dat$first.author.female = ordered(dat.nums$author.first.author.female)
dat$first.author.female[which(pmax(dat.nums$author.first.author.male, dat.nums$author.first.author.female) <= 0)] = NA
# Removing male because so highly correlated with female
#dat$first.author.male = ordered(dat.nums$author.first.author.male)
dat$first.author.gender.not.found = ordered(dat.nums$author.first.author.gender.not.found)
dat$first.author.num.prev.pubs = tr(dat.nums$first.author.num.prev.pubs)
dat$first.author.num.prev.pmc.cites = tr(dat.nums$first.author.num.prev.pmc.cites)
dat$first.author.year.first.pub.ago = tr( 2010 - dat.nums$first.author.year.first.pub)
# NAs are either because there was no cluster, or the cluster included no prior papers
# (unfortunately, I didn't collect an indication of which was true, but in theory there should
# be a cluster for all papers published before 2009 Author-ity data was run, so I will
# assume it is the latter, that there we no prior papers.)
# I will code no prior papers as zero years since the prior paper
dat$first.author.year.first.pub.ago[is.na(dat.nums$first.author.year.first.pub)] = tr(0)
dat$first.author.num.prev.microarray.creations = tr(dat.nums$first.author.num.prev.microarray.creations)
dat$first.author.num.prev.oa = tr(dat.nums$first.author.num.prev.oa)
dat$first.author.num.prev.other.sharing = tr(
dat.nums$first.author.num.prev.genbank.sharing +
dat.nums$first.author.num.prev.pdb.sharing +
dat.nums$first.author.num.prev.swissprot.sharing)
dat$first.author.num.prev.geoae.sharing = tr(dat.nums$first.author.num.prev.geoae.sharing)
dat$first.author.num.prev.geo.reuse = tr(dat.nums$first.author.num.prev.geo.reuse)
dat$first.author.num.prev.multi.center = tr(dat.nums$first.author.num.prev.multi.center)
# skip first.author.prev.meta.analysis because not enough > 0
# set to NA if the number of publications is unrealistically big, because this probably means that
# Author-ity combined two or more authors together. Better to set to NA and deal with bias than introduce
# such large numbers into our data
first.author.thresh.cluster.size = quantile(dat$first.author.num.prev.pubs, .98, na.rm=TRUE)
first.author.unrealistic.pub.clusters = which(dat$first.author.num.prev.pubs > first.author.thresh.cluster.size)
dat$first.author.num.prev.pubs[first.author.unrealistic.pub.clusters] = NA
dat$first.author.num.prev.pmc.cites[first.author.unrealistic.pub.clusters] = NA
dat$first.author.year.first.pub.ago[first.author.unrealistic.pub.clusters] = NA
dat$first.author.num.prev.microarray.creations[first.author.unrealistic.pub.clusters] = NA
dat$first.author.num.prev.oa[first.author.unrealistic.pub.clusters] = NA
dat$first.author.num.prev.other.sharing [first.author.unrealistic.pub.clusters] = NA
dat$first.author.num.prev.geoae.sharing[first.author.unrealistic.pub.clusters] = NA
dat$first.author.num.prev.geo.reuse [first.author.unrealistic.pub.clusters] = NA
dat$first.author.num.prev.multi.center[first.author.unrealistic.pub.clusters] = NA
####### LAST AUTHOR VARIABLES
# skip last_author_first_name
dat$last.author.female = ordered(dat.nums$last.first.author.female)
dat$last.author.female[which(pmax(dat.nums$last.first.author.male, dat.nums$last.first.author.female) <= 0)] = NA
# Removing male because so highly correlated with female
#dat$last.author.male = ordered(dat.nums$last.first.author.male)
dat$last.author.gender.not.found = ordered(dat.nums$last.first.author.gender.not.found)
dat$last.author.num.prev.pubs = tr(dat.nums$last.author.num.prev.pubs)
dat$last.author.num.prev.pmc.cites = tr(dat.nums$last.author.num.prev.pmc.cites)
dat$last.author.year.first.pub.ago = tr(2010 - dat.nums$last.author.year.first.pub)
# See comment on first author above for treatment of NAs for this variable
dat$last.author.year.first.pub.ago[is.na(dat.nums$last.author.year.first.pub)] = tr(0)
dat$last.author.num.prev.microarray.creations = tr(dat.nums$last.author.num.prev.microarray.creations)
dat$last.author.num.prev.oa = tr(dat.nums$last.author.num.prev.oa)
dat$last.author.num.prev.other.sharing = tr(
dat.nums$last.author.num.prev.genbank.sharing +
dat.nums$last.author.num.prev.pdb.sharing +
dat.nums$last.author.num.prev.swissprot.sharing )
dat$last.author.num.prev.geoae.sharing = tr(dat.nums$last.author.num.prev.geoae.sharing)
dat$last.author.num.prev.geo.reuse = tr(dat.nums$last.author.num.prev.geo.reuse)
dat$last.author.num.prev.multi.center = tr(dat.nums$last.author.num.prev.multi.center)
# skip last.author.prev.meta.analysis because not enough > 0
# set to NA if the number of publications is unrealistically big, because this probably means that
# Author-ity combined two or more authors together. Better to set to NA and deal with bias than introduce
# such large numbers into our data
last.author.thresh.cluster.size = quantile(dat$last.author.num.prev.pubs, .98, na.rm=TRUE)
last.author.unrealistic.pub.clusters = which(dat$last.author.num.prev.pubs >= last.author.thresh.cluster.size)
dat$last.author.num.prev.pubs[last.author.unrealistic.pub.clusters] = NA
dat$last.author.num.prev.pmc.cites[last.author.unrealistic.pub.clusters] = NA
dat$last.author.year.first.pub.ago[last.author.unrealistic.pub.clusters] = NA
dat$last.author.num.prev.microarray.creations[last.author.unrealistic.pub.clusters] = NA
dat$last.author.num.prev.oa[last.author.unrealistic.pub.clusters] = NA
dat$last.author.num.prev.other.sharing [last.author.unrealistic.pub.clusters] = NA
dat$last.author.num.prev.geoae.sharing[last.author.unrealistic.pub.clusters] = NA
dat$last.author.num.prev.geo.reuse [last.author.unrealistic.pub.clusters] = NA
dat$last.author.num.prev.multi.center[last.author.unrealistic.pub.clusters] = NA
#### SOME FUNDING VARIABLES AND STUDY VARIABLES
# skip address
# skip institution, use instition_clean instead below
# skip country, use country_clean instead below
# skip grant_numbers, use lots of other related variables below
dat$num.grant.numbers = tr(dat.nums$num.grant.numbers)
dat$num.nih.is.nci = tr(dat.nums$nih.is.nci)
dat$num.nih.is.nhlbi = tr(dat.nums$nih.is.nhlbi)
dat$num.nih.is.ncrr = tr(dat.nums$nih.is.ncrr)
dat$num.nih.is.niehs = tr(dat.nums$nih.is.niehs)
dat$num.nih.is.ninds = tr(dat.nums$nih.is.ninds)
dat$num.nih.is.niddk = tr(dat.nums$nih.is.niddk)
dat$num.nih.is.nigms = tr(dat.nums$nih.is.nigms)
dat$num.nih.is.niaid = tr(dat.nums$nih.is.niaid)
# skip is_review because all 0s, as it should be
# need to use pubmed.medline.status to figure out what should really be NA because of incomplete mesh terms
dat$pubmed.is.humans = ordered(medline.values(dat.nums$pubmed.is.humans, dat.raw$pubmed.medline.status))
dat$pubmed.is.animals = ordered(medline.values(dat.nums$pubmed.is.animals, dat.raw$pubmed.medline.status))
dat$pubmed.is.mice = ordered(medline.values(dat.nums$pubmed.is.mice, dat.raw$pubmed.medline.status))
dat$pubmed.is.fungi = ordered(medline.values(dat.nums$pubmed.is.fungi, dat.raw$pubmed.medline.status))
dat$pubmed.is.bacteria = ordered(medline.values(dat.nums$pubmed.is.bacteria, dat.raw$pubmed.medline.status))
dat$pubmed.is.plants = ordered(medline.values(dat.nums$pubmed.is.plants, dat.raw$pubmed.medline.status))
dat$pubmed.is.viruses = ordered(medline.values(dat.nums$pubmed.is.viruses, dat.raw$pubmed.medline.status))
dat$pubmed.is.cultured.cells = ordered(medline.values(dat.nums$pubmed.is.cultured.cells, dat.raw$pubmed.medline.status))
# is cancer appears to depend not entirely on MEDLINE completeness
dat$pubmed.is.cancer = ordered(dat.nums$pubmed.is.cancer)
# Open access filter, and some others below, are by journal not article, so doesn't depend on MEDLINE completion status
dat$pubmed.is.open.access = ordered(dat.nums$pubmed.is.open.access)
dat$pubmed.is.effectiveness = ordered(medline.values(dat.nums$pubmed.is.effectiveness, dat.raw$pubmed.medline.status))
dat$pubmed.is.diagnosis = ordered(medline.values(dat.nums$pubmed.is.diagnosis, dat.raw$pubmed.medline.status))
dat$pubmed.is.prognosis = ordered(medline.values(dat.nums$pubmed.is.prognosis, dat.raw$pubmed.medline.status))
dat$pubmed.is.core.clinical.journal = ordered(dat.nums$pubmed.is.core.clinical.journal)
# skip is_clinical_trial because not enough
# skip is_randomized_clinical_trial because not enough
# skip is_meta_analysis because none (yay!)
dat$pubmed.is.comparative.study = ordered(medline.values(dat.nums$pubmed.is.comparative.study, dat.raw$pubmed.medline.status))
# skip multicenter because not enough (surprising?)
# skip validation because not enough (surprising?)
# skip funded_stimulus because all 0s
# external funding seems set sometimes even for 1s, so doesn't depend on MEDLINE completely
# commenting out external in favour of a combined nih below
#dat$pubmed.is.funded.nih.extramural = ordered(dat.nums$pubmed.is.funded.nih.extramural)
dat$pubmed.is.funded.nih.intramural = ordered(dat.nums$pubmed.is.funded.nih.intramural)
# Combining extramural, intramural, and whether the PMID had a link to funding in the NIH data sources
dat$pubmed.is.funded.nih = ordered(pmax(dat.nums$pubmed.is.funded.nih.extramural, dat.nums$pubmed.is.funded.nih.intramural, !is.na(dat.nums$num.grants)))
dat$pubmed.is.funded.non.us.govt = ordered(dat.nums$pubmed.is.funded.non.us.govt)
# Sharing in PDB and Swissprot very small, so will combine
dat$pubmed.is.shared.other = ordered(pmax(dat.nums$pubmed.in.genbank, dat.nums$pubmed.in.pdb, dat.nums$pubmed.in.swissprot))
# geo.reuse is tiny but I want to use it anyway!
dat$pubmed.is.geo.reuse = ordered(dat.nums$is.geo.reuse)
dat$pubmed.num.cites.from.pmc = tr(dat.nums$pubmed.number.times.cited.in.pmc)
dat$pubmed.num.cites.from.pmc.per.year = dat.nums$pubmed.number.times.cited.in.pmc/(2010 - dat.nums$pubmed.year.published)
# Comment these out for now, because they look wrong
#dat$found.by.highwire = ordered(dat.nums$found.by.highwire)
#dat$found.by.scirus = ordered(dat.nums$found.by.highwire)
#dat$found.by.googlescholar = ordered(dat.nums$found.by.highwire)
#dat$found.by.pmc = ordered(dat.nums$portal.pmids.found.by.pmc)
##### DEPENDENT VARIABLES
dat$dataset.in.geo = ordered(dat.nums$pubmed.in.geo)
# commenting this out in favour of a combined metric below
#dat$dataset.in.arrayexpress = ordered(dat.nums$in.arrayexpress)
dat$dataset.in.geo.or.ae = ordered(dat.nums$in.ae.or.geo)
###### INSTITUTION
dat$country.australia = ordered(ifelse("Australia" == dat.raw$country.clean, 1, 0))
dat$country.canada = ordered(ifelse("Canada" == dat.raw$country.clean, 1, 0))
dat$country.china = ordered(ifelse("China" == dat.raw$country.clean, 1, 0))
dat$country.france = ordered(ifelse("France" == dat.raw$country.clean, 1, 0))
dat$country.germany = ordered(ifelse("Germany" == dat.raw$country.clean, 1, 0))
dat$country.japan = ordered(ifelse("Japan" == dat.raw$country.clean, 1, 0))
dat$country.korea = ordered(ifelse("Korea" == dat.raw$country.clean, 1, 0))
dat$country.uk = ordered(ifelse("UK" == dat.raw$country.clean, 1, 0))
dat$country.usa = ordered(ifelse("USA" == dat.raw$country.clean, 1, 0))
dat$institution.is.medical = ordered(dat.nums$institution.hospital.or.medlcal)
dat$institution.rank = dat.nums$institution.rank
dat$institution.is.higher.ed = ordered(ifelse("Higher educ." == dat.raw$institution.sector, 1, 0))
dat$institution.is.higher.ed[which(is.na(dat$institution.rank))] = NA
dat$institution.is.govnt = ordered(ifelse("Government" == dat.raw$institution.sector, 1, 0))
dat$institution.is.govnt[which(is.na(dat$institution.rank))] = NA
dat$institution.research.output = tr(dat.nums$institution.output)
dat$institution.international.collaboration = dat.nums$international.collaboration
dat$institution.mean.norm.impact.factor = dat.nums$institution.norm.sjr
dat$institution.mean.norm.citation.score = dat.nums$institution.norm.citation.score
# Include variables for the three most common institutions
#top.institutions = names(sort(table(dat.institutions$institutions.factor), decreasing=TRUE)[0:25])
#display.institutions = c(top.institutions)
dat$institution.harvard = ordered(ifelse("Harvard University" == dat.raw$institution.clean, 1, 0))
dat$institution.nci = ordered(ifelse("National Cancer Institute" == dat.raw$institution.clean, 1, 0))
dat$institution.stanford = ordered(ifelse("Stanford University" == dat.raw$institution.clean, 1, 0))
### JOURNAL VARIABLES
# skip total cites 2008 because it could be due to so many things, not very informative
#dat$journal.cites.2008 = tr(dat.nums$journal.2008.cites)
dat$journal.impact.factor = log.tr(dat.nums$journal.impact.factor)
# PLoS One is a big part of our sample, and doesn't have an official impact factor yet.
# Important, because it is relatively low, so better to give it our estimate instead of leave missing
# Estimate of 3 comes from http://pbeltrao.blogspot.com/2009/04/guestimating-plos-one-impact-factor.html
dat$journal.impact.factor[which(dat.raw$pubmed.journal== "PLoS One")] = log.tr(3)
dat$journal.5yr.impact.factor = log.tr(dat.nums$journal.5yr.impact.factor)
dat$journal.immediacy.index = log.tr(dat.nums$journal.immediacy.index)
dat$journal.num.articles.2008 = tr(dat.nums$journal.num.articles.2008)
dat$journal.cited.halflife = dat.nums$journal.cited.halflife
# skip journal_policy_try because it is redundant
dat$journal.policy.requires.microarray.accession = ordered(ifelse(dat.nums$journal.policy.requires.microarray.accession > 0, 1, 0))
# says must deposit should be true whenever requires microarray accession is tru
dat$journal.policy.says.must.deposit = ordered(ifelse(pmax(dat.nums$journal.policy.says.must.deposit, dat.nums$journal.policy.requires.microarray.accession) > 0, 1, 0))
dat$journal.policy.at.least.requests.sharing.microarray = ordered(ifelse(dat.nums$journal.policy.at.least.requests.sharing.microarray > 0, 1, 0))
dat$journal.policy.mentions.any.sharing = ordered(ifelse(dat.nums$journal.policy.mentions.any.sharing > 0, 1, 0))
dat$journal.policy.general.statement = ordered(ifelse(dat.nums$journal.policy.general.statement > 0, 1, 0))
# requests accession should be true whenenever requires accession is true, but looks like I didn't code it that way
dat$journal.policy.requests.accession = ordered(ifelse(pmax(dat.nums$journal.policy.requests.accession, dat.nums$journal.policy.requires.microarray.accession) > 0, 1, 0))
dat$journal.policy.mentions.exceptions = ordered(ifelse(dat.nums$journal.policy.mentions.exceptions > 0, 1, 0))
dat$journal.policy.mentions.consequences = ordered(ifelse(dat.nums$journal.policy.mentions.consequences > 0, 1, 0))
dat$journal.policy.contains.word.microarray = ordered(ifelse(dat.nums$journal.policy.contains.word.microarray > 0, 1, 0))
dat$journal.policy.contains.word.miame.mged = ordered(ifelse(dat.nums$journal.policy.contains.word.MIAME.MGED > 0, 1, 0))
dat$journal.policy.contains.word.arrayexpress = ordered(ifelse(dat.nums$journal.policy.contains.word.arrayexpress > 0, 1, 0))
dat$journal.policy.contains.word.geo.omnibus = ordered(ifelse(dat.nums$journal.policy.contains.word.geo.omnibus > 0, 1, 0))
dat$journal.policy.requests.sharing.other.data = ordered(ifelse(dat.nums$journal.policy.requests.sharing.other.data > 0, 1, 0))
# For some reason this variable didn't have NAs, but it should have the same NAs as the other journal policy vars
dat$journal.policy.requests.sharing.other.data[is.na(dat$journal.policy.requests.accession)] = NA
# Add a variable saying how many microarray creating (as defined by being in this set)
# papers the journal has published
table.journal.counts = table(dat.raw$pubmed.journal)
dat$journal.microarray.creating.count = tr(as.numeric(table.journal.counts[dat.raw$pubmed.journal]))
#### MORE FUNDING VARIABLES
# set the missing values to 0 or tr(0), as appropriate
# skip pmid:1 as duplicate column
dat$num.grants.via.nih = tr(dat.nums$num.grants)
dat$num.grants.via.nih[is.na(dat$num.grants.via.nih)] = tr(0)
dat$max.grant.duration = tr(dat.nums$longest.num.years)
dat$max.grant.duration[is.na(dat$max.grant.duration)] = tr(0)
dat$nih.first.year.ago = tr(2010 - dat.nums$first.year)
dat$nih.first.year.ago[is.na(dat$nih.first.year.ago)] = NA
dat$nih.last.year.ago = tr(2010 - dat.nums$last.year)
dat$nih.last.year.ago[is.na(dat$nih.last.year.ago)] = NA
dat$nih.cumulative.years = tr(dat.nums$cumulative.years)
dat$nih.cumulative.years[is.na(dat$nih.cumulative.years)] = tr(0)
dat$nih.sum.avg.dollars = tr(dat.nums$sum.avg.dollars)
dat$nih.sum.avg.dollars[is.na(dat$nih.sum.avg.dollars)] = tr(0)
dat$nih.sum.sum.dollars = tr(dat.nums$sum.sum.dollars)
dat$nih.sum.sum.dollars[is.na(dat$nih.sum.sum.dollars)] = tr(0)
dat$nih.max.max.dollars = tr(dat.nums$max.max.dollars)
dat$nih.max.max.dollars[is.na(dat$nih.max.max.dollars)] = tr(0)
# skip
# skip grant_activity_codes list, because captured below
dat$has.R01.funding = ordered(dat.nums$has.R01.funding)
dat$has.R01.funding[is.na(dat$has.R01.funding)] = 0
#dat$has.T32.funding = ordered(dat.nums$has.T32.funding)
#dat$has.T32.funding[is.na(dat$has.T32.funding)] = 0
# other funding types dropped because don't occurr often enough
# look for patterns here
dat$has.P.funding = ordered(ifelse(grepl("P", dat.raw$grant.activity.codes), 1, 0))
dat$has.R.funding = ordered(ifelse(grepl("R", dat.raw$grant.activity.codes), 1, 0))
dat$has.T.funding = ordered(ifelse(grepl("T", dat.raw$grant.activity.codes), 1, 0))
dat$has.U.funding = ordered(ifelse(grepl("U", dat.raw$grant.activity.codes), 1, 0))
dat$has.K.funding = ordered(ifelse(grepl("K", dat.raw$grant.activity.codes), 1, 0))
# see below this block for handling of their NAs
# only pick one funding level, otherwise too singular
# not enough post2007
dat$num.post2003.morethan500k = tr(dat.nums$num.post2003.morethan500k)
dat$num.post2004.morethan500k = tr(dat.nums$num.post2004.morethan500k)
dat$num.post2005.morethan500k = tr(dat.nums$num.post2005.morethan500k)
dat$num.post2006.morethan500k = tr(dat.nums$num.post2006.morethan500k)
#dat$num.post2007.morethan500k = tr(dat.nums$num.post2007.morethan500k)
dat$num.post2003.morethan750k = tr(dat.nums$num.post2003.morethan750k)
dat$num.post2004.morethan750k = tr(dat.nums$num.post2004.morethan750k)
dat$num.post2005.morethan750k = tr(dat.nums$num.post2005.morethan750k)
dat$num.post2006.morethan750k = tr(dat.nums$num.post2006.morethan750k)
#dat$num.post2007.morethan750k = tr(dat.nums$num.post2007.morethan750k)
dat$num.post2003.morethan1000k = tr(dat.nums$num.post2003.morethan1000k)
dat$num.post2004.morethan1000k = tr(dat.nums$num.post2004.morethan1000k)
dat$num.post2005.morethan1000k = tr(dat.nums$num.post2005.morethan1000k)
dat$num.post2006.morethan1000k = tr(dat.nums$num.post2006.morethan1000k)
#dat$num.post2007.morethan1000k = tr(dat.nums$num.post2007.morethan1000k)
dat$num.post2003.morethan500k[is.na(dat$num.post2003.morethan500k)] = tr(0)
dat$num.post2004.morethan500k[is.na(dat$num.post2004.morethan500k)] = tr(0)
dat$num.post2005.morethan500k[is.na(dat$num.post2005.morethan500k)] = tr(0)
dat$num.post2006.morethan500k[is.na(dat$num.post2006.morethan500k)] = tr(0)
#dat$num.post2007.morethan500k[is.na(dat$num.post2007.morethan500k)] = tr(0)
dat$num.post2003.morethan750k[is.na(dat$num.post2003.morethan750k)] = tr(0)
dat$num.post2004.morethan750k[is.na(dat$num.post2004.morethan750k)] = tr(0)
dat$num.post2005.morethan750k[is.na(dat$num.post2005.morethan750k)] = tr(0)
dat$num.post2006.morethan750k[is.na(dat$num.post2006.morethan750k)] = tr(0)
#dat$num.post2007.morethan750k[is.na(dat$num.post2007.morethan750k)] = tr(0)
dat$num.post2003.morethan1000k[is.na(dat$num.post2003.morethan1000k)] = tr(0)
dat$num.post2004.morethan1000k[is.na(dat$num.post2004.morethan1000k)] = tr(0)
dat$num.post2005.morethan1000k[is.na(dat$num.post2005.morethan1000k)] = tr(0)
dat$num.post2006.morethan1000k[is.na(dat$num.post2006.morethan1000k)] = tr(0)
#dat$num.post2007.morethan1000k[is.na(dat$num.post2007.morethan1000k)] = tr(0)
dat$dataset.in.geo.or.ae.int = dat.nums$in.ae.or.geo
#dim(dat)
#save(dat, file="dat.Rdata")
return(dat)
}
explore.cleaned.data = function
(
dat
)
{
library(Hmisc)
library(psych)
# both Hmisc and psych have a describe
Hmisc::describe(dat)
#psych::describe(dat)
s = summary(dataset.in.geo.or.ae.int ~ ., dat)
s
}
display.attributes = c(
"num.authors"
,"first.author.female"
,"first.author.gender.not.found"
,"first.author.num.prev.pubs"
,"first.author.num.prev.pmc.cites"
,"first.author.year.first.pub.ago"
,"first.author.num.prev.microarray.creations"
,"first.author.num.prev.oa"
,"first.author.num.prev.other.sharing"
,"first.author.num.prev.geo.reuse"
,"first.author.num.prev.multi.center"
,"last.author.female"
,"last.author.gender.not.found"
,"last.author.num.prev.pubs"
,"last.author.num.prev.pmc.cites"
,"last.author.year.first.pub.ago"
,"last.author.num.prev.microarray.creations"
,"last.author.num.prev.oa"
,"last.author.num.prev.other.sharing"
,"last.author.num.prev.geo.reuse"
,"last.author.num.prev.multi.center"
,"num.grant.numbers"
,"num.nih.is.nci"
,"num.nih.is.nhlbi"
,"num.nih.is.ncrr"
,"num.nih.is.niehs"
,"num.nih.is.ninds"
,"num.nih.is.niddk"
,"num.nih.is.nigms"
,"num.nih.is.niaid"
,"pubmed.is.humans"
,"pubmed.is.animals"
,"pubmed.is.mice"
,"pubmed.is.fungi"
,"pubmed.is.bacteria"
,"pubmed.is.plants"
,"pubmed.is.viruses"
,"pubmed.is.cultured.cells"
,"pubmed.is.cancer"
,"pubmed.is.open.access"
,"pubmed.is.effectiveness"
,"pubmed.is.diagnosis"
,"pubmed.is.prognosis"
,"pubmed.is.core.clinical.journal"
,"pubmed.is.comparative.study"
,"pubmed.is.funded.nih.intramural"
,"pubmed.is.funded.nih"
,"pubmed.is.funded.non.us.govt"
,"pubmed.is.shared.other"
,"pubmed.num.cites.from.pmc"
,"pubmed.num.cites.from.pmc.per.year"
,"dataset.in.geo"
,"dataset.in.geo.or.ae"
,"country.australia"
,"country.canada"
,"country.china"
,"country.france"
,"country.germany"
,"country.japan"
,"country.korea"
,"country.uk"
,"country.usa"
,"institution.is.medical"
,"institution.rank"
,"institution.is.higher.ed"
,"institution.is.govnt"
,"institution.research.output"
,"institution.international.collaboration"
,"institution.mean.norm.impact.factor"
,"institution.mean.norm.citation.score"
,"journal.impact.factor"
,"journal.num.articles.2008"
,"journal.policy.requires.microarray.accession"
,"journal.policy.general.statement"
,"journal.policy.requests.accession"
,"journal.policy.mentions.exceptions"
,"journal.policy.mentions.consequences"
,"journal.policy.contains.word.miame.mged"
,"journal.policy.contains.word.arrayexpress"
,"journal.policy.contains.word.geo.omnibus"
,"journal.policy.requests.sharing.other.data"
,"journal.microarray.creating.count"
,"num.grants.via.nih"
,"max.grant.duration"
,"nih.first.year.ago"
,"nih.last.year.ago"
,"nih.cumulative.years"
,"nih.sum.avg.dollars"
,"nih.sum.sum.dollars"
,"nih.max.max.dollars"
,"has.R01.funding"
,"has.P.funding"
,"has.R.funding"
,"has.T.funding"
,"has.U.funding"
,"has.K.funding"
,"num.post2004.morethan1000k"
,"dataset.in.geo.or.ae.int"
)
make.figure.S1toS5 = function
(
dat.untransformed,
screen=SCREEN,
format=FORMAT
)
{
label(dat.untransformed$first.author.num.prev.microarray.creations) <- "first.author.num.prev.micro.produce"
label(dat.untransformed$last.author.num.prev.microarray.creations) <- "last.author.num.prev.micro.produce"
label(dat.untransformed$journal.policy.at.least.requests.sharing.microarray) <- "journal.policy.requests.or.requires.micro.data"
label(dat.untransformed$journal.policy.contains.word.geo.omnibus) <- "journal.policy.contains.word.geo.omnibus"
# to see formatting, try ?dotchart2 or ?plot.summary.formula.response
mynames = names(dat.untransformed)
sections = 5
section.length = ceil(length(mynames) / sections)
#for (ii in 0:0) {
for (ii in 0:(sections-1)) {
## Check to make sure this is an even divide do don't miss any!
#first.count = ii*section.length
#last.count = min(length(mynames), ((ii+1)*section.length)-1)
#mynames.section = mynames[first.count:last.count]
# group them by topic instead
variable.groups = c("^num.authors|^first|^last",
"journal",
"pubmed",
"nih|grant|num.post|funding",
"country|institution")
mynames.section = mynames[grep(variable.groups[ii+1], mynames)]
mynames.section = intersect(mynames.section, display.attributes)
if ("dataset.in.geo.or.ae.int" %nin% mynames.section) {
mynames.section = c(mynames.section, "dataset.in.geo.or.ae.int")
}
print(ii)
print(mynames.section)
s = summary(dataset.in.geo.or.ae.int ~ ., dat.untransformed[,names(dat.untransformed) %in% mynames.section], continuous = 3)
#s
#quartz()
filename = paste("figureS", ii+1, sep="")
print(filename)
setup.figure(filename, MAX.FIGURE.WIDTH, MAX.FIGURE.HEIGHT, screen=screen, format=format)
par(mai=c(1,5,0.2,1)) # bottom, left, top, right
response = try(
plot.summary.formula.response.CIs(s, width.factor=2, cex.labels=0.4, cex=0.9, xlim=c(0,1), xlab="Proportion of studies with datasets\nfound in GEO or ArrayExpress", main="")
, TRUE)
if (class(response)=="try-error") {
plot(s, width.factor=2, cex.labels=0.4, cex=0.9, xlim=c(0,1), xlab="Proportion of studies with datasets\nfound in GEO or ArrayExpress", main="")
}
close.figure(filename, screen=screen, format=format)
}
}
get.dat.nums = function
( dat.raw )
{
library(plyr)
dat.nums = colwise(as.numeric)(dat.raw) ##<<details this produces warnings
return(dat.nums)
}
make.figure.journals = function
(
dat,
n.journals = 25,
screen=SCREEN,
format=FORMAT
)
{
dat.journals = dat
dat.journals$dataset.in.geo.or.ae.int = as.numeric(dat.journals$in.ae.or.geo)
# Consolidate the less frequent journals into Other
top.journals = names(sort(table(dat.journals$pubmed.journal), decreasing=TRUE)[0:n.journals])
dat.journals$journal.factor = factor(dat.journals$pubmed.journal)
levels(dat.journals$journal.factor)[levels(dat.journals$journal.factor) %nin% top.journals] <- "OTHER"
s = summary(dataset.in.geo.or.ae.int ~ journal.factor, dat.journals)
s
setup.figure("journals", MAX.FIGURE.WIDTH, MAX.FIGURE.WIDTH, screen=screen, format=format)
par(mai=c(1,5,0.2,1)) # bottom, left, top, right
plot.summary.formula.response.CIs(s, width.factor=2, cex.labels=0.5, cex=0.9, xlim=c(0,1), xlab="Proportion of studies with datasets\nfound in GEO or ArrayExpress", main="")
close.figure("journals", screen=screen, format=format)
ps.chisq = c()
for (var in top.journals) {
pv1 <- chisq.test(table(dat.journals$dataset.in.geo.or.ae, dat.journals$journal.factor==var))$p.value
ps.chisq = cbind(ps.chisq, pv1)
}
colnames(ps.chisq) = top.journals
adjusted.ps.chisq = p.adjust(ps.chisq, method = c("none"))
colnames(adjusted.ps.chisq) = colnames(ps.chisq)
print(t(rbind(colnames(adjusted.ps.chisq)[order(adjusted.ps.chisq)], adjusted.ps.chisq[order(adjusted.ps.chisq)])))
}
### RUN IT ALL!
setwd("~/Documents/Projects/ASIST2011/paper")
## The source file below can be found at https://gist.github.com/999036
source("plot.summary.formula.response.CIs.R")
## Dataset was downloaded from http://dx.doi.org/10.5061/dryad.mf1sd/1.
## Please cite use of this data in the references section of publications as
## Piwowar HA (2011) Data from: Who shares? Who doesn’t? Factors associated with openly archiving raw research data. Dryad Digital Repository. doi:10.5061/dryad.mf1sd
raw.data.filename = "rawdata.txt"
dat.raw.all = read.raw.data(raw.data.filename)
##Keep just the data that has journal policy information
dat.raw.policycomplete = dat.raw.all
dat.raw.policycomplete = dat.raw.policycomplete[!is.na(dat.raw.policycomplete$journal.policy.says.must.deposit),]
dat.raw.policycomplete = dat.raw.policycomplete[(dat.raw.policycomplete$pubmed.year.published<2010),]
##Keep just the studies published in journals that require deposition
dat.raw.policymustdeposit = dat.raw.policycomplete[(dat.raw.policycomplete$journal.policy.says.must.deposit>0),]
##Keep just the studies where neither the first nor last author have shared before (by my estimates)
dat.raw.policymustdeposit.nevershared = dat.raw.policymustdeposit[intersect(
which(dat.raw.policymustdeposit$first.author.num.prev.geoae.sharing==0),
which(dat.raw.policymustdeposit$last.author.num.prev.geoae.sharing==0)),]
##Keep just the recent studies, published from 2007 through 2009
dat.raw.policymustdeposit.nevershared.recent = subset(dat.raw.policymustdeposit.nevershared, subset=pubmed.year.published>=2007)
dat.raw = dat.raw.policymustdeposit.nevershared.recent
##Process the data
dat.clean = preprocess.raw.data(dat.raw)
dat.clean = subset(dat.clean, select = -first.author.num.prev.geoae.sharing)
dat.clean = subset(dat.clean, select = -last.author.num.prev.geoae.sharing)
##Display the results
options(digits=4)
options(width=100)
explore.cleaned.data(dat.clean)
make.figure.S1toS5(dat.clean, screen=TRUE, format=FORMAT)
make.figure.journals(dat.raw, 25, screen=TRUE, format=FORMAT)
#### Look at which categorical variables have significant differences
dat = dat.clean
ps.chisq = c()
for (var in names(dat)) {
pv1 <- chisq.test(table(dat[,"dataset.in.geo.or.ae.int"], dat[,var]))$p.value
ps.chisq = cbind(ps.chisq, pv1)
}
colnames(ps.chisq) = names(dat)
# Review the type of p-value multiple comparison adjustment used below before interpreting!
adjusted.ps.chisq = p.adjust(ps.chisq, method = c("none"))
colnames(adjusted.ps.chisq) = colnames(ps.chisq)
varnames = colnames(adjusted.ps.chisq)
ordered.results = order(adjusted.ps.chisq)
print("chi squared p-values, so only look at the results for category variables (review multiple comparison adjustment used below before interpreting!):")
print(t(rbind(varnames[ordered.results], adjusted.ps.chisq[ordered.results])))
#### Look at which continuous variables have significant differences
ps.wilcox = c()
for (var in names(dat)) {
pv1 = NA
try(pv1 <- wilcox.test(dat[,var] ~ dat[,"dataset.in.geo.or.ae.int"])$p.value, TRUE)
ps.wilcox = cbind(ps.wilcox, pv1)
}
colnames(ps.wilcox) = names(dat)
# Review the type of p-value multiple comparison adjustment used below before interpreting!
adjusted.ps.wilcox = p.adjust(ps.wilcox, method = c("none"))
colnames(adjusted.ps.wilcox) = colnames(ps.wilcox)
varnames = colnames(adjusted.ps.wilcox)
ordered.results = order(adjusted.ps.wilcox)
print("wilcox p-values, so only look at the results for continuous variables (review multiple comparison adjustment used below before interpreting!):")
print(t(rbind(varnames[ordered.results], adjusted.ps.wilcox[ordered.results])))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment