### Stats for the analysis and graphics in Piwowar ASIS&T 2011 submission "Finding the hold-outs"
### Heather Piwowar
### MIT open license.
### blog post about interim results:

library(rms)

# if true, draw figures on screen instead of in a file
SCREEN = TRUE
# set to ".eps", ".png", or ".pdf"
FORMAT = ".pdf"

# PLoS standards from http://www.plosone.org/static/figureGuidelines.action#quickref
MAX.FIGURE.WIDTH = 6.83
MAX.FIGURE.HEIGHT = 8
setup.figure = function
### Set up a graphics object for plotting a journal-ready figure
### call this, then do the plot, then call dev.off()
(
    filename,      ##<< filename for the resulting figure file. Will get .eps or .png or .pdf appended
    width=6.83,    ##<< figure width, in inches
    height=8,      ##<< figure height, in inches
    format=".eps", ##<< ".eps" or ".png" or ".pdf"
    screen=FALSE
) {
    if (screen) {
        quartz()
    } else {
        filename.with.extension = paste(filename, format, sep="")
        if (format == ".eps") {
            postscript(file=filename.with.extension,
                paper="special",
                width=width,
                height=height,
                family="sans",
                onefile=FALSE,
                pagecentre=FALSE,
                horizontal=FALSE)
        } else if (format == ".png") {
            png(filename.with.extension)
        } else if (format == ".pdf") {
            pdf(filename.with.extension)
        }
    }
    return()
    ##examples<<
    setup.figure("figure1", 6, 8)
    plot(c(1, 2, 3), c(2, 4, 6))
    close.figure("figure1")
}
close.figure = function
### Closes a graphics object and embeds fonts in the associated .eps figure
### To be used with setup.figure
(
    filename,      ##<< filename of the eps file. Will get .eps appended
    format=".eps", ##<< ".eps", ".png", or ".pdf"
    screen=FALSE
) {
    if (!screen) {
        dev.off()
        if (format == ".eps") {
            filename.with.extension = paste(filename, ".eps", sep="")
            embedFonts(filename.with.extension, outfile=filename.with.extension)
        }
    }
    return()
    ##examples<<
    setup.figure("figure1", 6, 8)
    plot(c(1, 2, 3), c(2, 4, 6))
    close.figure("figure1")
}
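### A minimal usage sketch of the two figure helpers above, using the SCREEN and FORMAT
### globals (not run when this file is sourced; "figure.demo" is just an illustrative placeholder name):
#setup.figure("figure.demo", width=6, height=4, format=FORMAT, screen=SCREEN)
#plot(c(1, 2, 3), c(2, 4, 6))
#close.figure("figure.demo", format=FORMAT, screen=SCREEN)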
read.raw.data = function
### Read the raw text data file
### change underscores in column names to periods
### return as a data frame
(
    filename="rawdata.txt" ##<< name of raw data
)
{
    dat.raw = read.csv(filename, header=TRUE, sep="\t", stringsAsFactors=FALSE)
    names(dat.raw) = gsub("_", ".", names(dat.raw))
    #names.pretty = cat(names(dat.raw), sep="\n")
    #print names.pretty
    #save(dat.raw, file="dat_raw.Rdata")
    #load("dat_raw.Rdata")
    #dim(dat.raw)
    #summary(dat.raw)
    #names(dat.raw)
    #names(dat.raw)[0:20]
    return(dat.raw)
}
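### A quick sketch of how read.raw.data is meant to be used (not run here;
### assumes a tab-delimited rawdata.txt with a header row, as in the Dryad dataset cited at the bottom of this file):
#dat.raw.example = read.raw.data("rawdata.txt")
#dim(dat.raw.example)
#names(dat.raw.example)[1:20]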
# Don't do transformations for this analysis. Use ranks, etc.
tr = function(x) return(x)
log.tr = function(x) return(x)
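### If transformations were wanted instead, tr and log.tr could be redefined here before
### preprocess.raw.data is called, e.g. (a sketch only -- NOT used in this analysis):
#tr = function(x) return(rank(x, na.last="keep"))   # rank-transform, keeping NAs as NA
#log.tr = function(x) return(log(1 + x))            # log(1+x) for skewed, zero-heavy variables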
#### Some of my variables are extracted by looking to see if MEDLINE records have a MeSH term.
# This is fine, but sometimes MEDLINE records are incomplete: they are still in process, or the journal
# isn't on the list covered by MeSH indexers. In those cases the lack of a relevant MeSH term does not
# mean the term doesn't apply, so the 0s must be replaced with NAs to mark the data as genuinely missing.
medline.values = function(raw_values, medline_status) {
    values = raw_values
    values[medline_status != "indexed for MEDLINE"] = NA
    return(values)
}
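### Toy illustration of medline.values (not run): a 0 is kept only when the record is
### fully indexed; otherwise it becomes NA, because the MeSH term may simply be missing.
#medline.values(c(1, 0, 0), c("indexed for MEDLINE", "indexed for MEDLINE", "in process"))
## expected: 1 0 NA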
preprocess.raw.data = function
### Do preprocessing on raw data
### change variables to ordered factors, integers, etc as appropriate
### return cleaned data as a data frame
(
    dat.raw ##<< data frame containing raw data
)
{
    #### CONVERT STRINGS TO NUMBERS
    dat.nums = get.dat.nums(dat.raw) ##<<details this produces warnings
    #library(psych)
    #described.dat.nums = psych::describe(dat.nums)
    #write.table(described.dat.nums,"described_dat_nums.txt",append=F,quote=F,sep="\t",row.names=T)

    #### START TO BUILD STATS DATASET
    dat = data.frame(pmid = as.numeric(dat.nums$pmid))
    # skip issn
    # skip essn

    # My naming convention is that "ago" means the number of years between 2010 and the original variable
    dat$years.ago = tr(2010 - dat.nums$pubmed.year.published)
    # skip date_in_pubmed
    # skip authors

    # Break journal into top contenders
    #dat$journal.bmc.genomics = ifelse("BMC Genomics" == dat.raw$pubmed.journal, 1, 0)
    #dat$journal.cancer.res = ifelse("Cancer Res" == dat.raw$pubmed.journal, 1, 0)
    #dat$journal.pnas = ifelse("Proc Natl Acad Sci U S A" == dat.raw$pubmed.journal, 1, 0)
    #dat$journal.j.biol.chem = ifelse("J Biol Chem" == dat.raw$pubmed.journal, 1, 0)
    #dat$journal.physiol.genomics = ifelse("Physiol Genomics" == dat.raw$pubmed.journal, 1, 0)
    #dat$journal.plos.one = ifelse("PLoS One" == dat.raw$pubmed.journal, 1, 0)

    dat$num.authors = tr(dat.nums$pubmed.number.authors)

    ####### FIRST AUTHOR VARIABLES
    # skip first_author_first_name
    dat$first.author.female = ordered(dat.nums$author.first.author.female)
    dat$first.author.female[which(pmax(dat.nums$author.first.author.male, dat.nums$author.first.author.female) <= 0)] = NA
    # Removing male because so highly correlated with female
    #dat$first.author.male = ordered(dat.nums$author.first.author.male)
    dat$first.author.gender.not.found = ordered(dat.nums$author.first.author.gender.not.found)
    dat$first.author.num.prev.pubs = tr(dat.nums$first.author.num.prev.pubs)
    dat$first.author.num.prev.pmc.cites = tr(dat.nums$first.author.num.prev.pmc.cites)
    dat$first.author.year.first.pub.ago = tr(2010 - dat.nums$first.author.year.first.pub)
    # NAs are either because there was no Author-ity cluster, or because the cluster included no prior papers.
    # (Unfortunately, I didn't collect an indication of which was true, but in theory there should
    # be a cluster for all papers published before the 2009 Author-ity data was run, so I will
    # assume it is the latter: there were no prior papers.)
    # I will code no prior papers as zero years since the prior paper
    dat$first.author.year.first.pub.ago[is.na(dat.nums$first.author.year.first.pub)] = tr(0)
    dat$first.author.num.prev.microarray.creations = tr(dat.nums$first.author.num.prev.microarray.creations)
    dat$first.author.num.prev.oa = tr(dat.nums$first.author.num.prev.oa)
    dat$first.author.num.prev.other.sharing = tr(
        dat.nums$first.author.num.prev.genbank.sharing +
        dat.nums$first.author.num.prev.pdb.sharing +
        dat.nums$first.author.num.prev.swissprot.sharing)
    dat$first.author.num.prev.geoae.sharing = tr(dat.nums$first.author.num.prev.geoae.sharing)
    dat$first.author.num.prev.geo.reuse = tr(dat.nums$first.author.num.prev.geo.reuse)
    dat$first.author.num.prev.multi.center = tr(dat.nums$first.author.num.prev.multi.center)
    # skip first.author.prev.meta.analysis because not enough > 0

    # Set to NA if the number of publications is unrealistically big, because this probably means that
    # Author-ity combined two or more authors together. Better to set to NA and deal with the bias than
    # introduce such large numbers into our data
    first.author.thresh.cluster.size = quantile(dat$first.author.num.prev.pubs, .98, na.rm=TRUE)
    first.author.unrealistic.pub.clusters = which(dat$first.author.num.prev.pubs > first.author.thresh.cluster.size)
    dat$first.author.num.prev.pubs[first.author.unrealistic.pub.clusters] = NA
    dat$first.author.num.prev.pmc.cites[first.author.unrealistic.pub.clusters] = NA
    dat$first.author.year.first.pub.ago[first.author.unrealistic.pub.clusters] = NA
    dat$first.author.num.prev.microarray.creations[first.author.unrealistic.pub.clusters] = NA
    dat$first.author.num.prev.oa[first.author.unrealistic.pub.clusters] = NA
    dat$first.author.num.prev.other.sharing[first.author.unrealistic.pub.clusters] = NA
    dat$first.author.num.prev.geoae.sharing[first.author.unrealistic.pub.clusters] = NA
    dat$first.author.num.prev.geo.reuse[first.author.unrealistic.pub.clusters] = NA
    dat$first.author.num.prev.multi.center[first.author.unrealistic.pub.clusters] = NA
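    ### Toy illustration of the masking rule above (not run): with a 98th-percentile
    ### threshold, only the most extreme values in a vector would be set to NA.
    #x = c(1, 3, 5, 200)
    #x[x > quantile(x, .98, na.rm=TRUE)] = NA   # 200 exceeds the threshold and becomes NA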
    ####### LAST AUTHOR VARIABLES
    # skip last_author_first_name
    dat$last.author.female = ordered(dat.nums$last.first.author.female)
    dat$last.author.female[which(pmax(dat.nums$last.first.author.male, dat.nums$last.first.author.female) <= 0)] = NA
    # Removing male because so highly correlated with female
    #dat$last.author.male = ordered(dat.nums$last.first.author.male)
    dat$last.author.gender.not.found = ordered(dat.nums$last.first.author.gender.not.found)
    dat$last.author.num.prev.pubs = tr(dat.nums$last.author.num.prev.pubs)
    dat$last.author.num.prev.pmc.cites = tr(dat.nums$last.author.num.prev.pmc.cites)
    dat$last.author.year.first.pub.ago = tr(2010 - dat.nums$last.author.year.first.pub)
    # See the comment on the first author above for treatment of NAs for this variable
    dat$last.author.year.first.pub.ago[is.na(dat.nums$last.author.year.first.pub)] = tr(0)
    dat$last.author.num.prev.microarray.creations = tr(dat.nums$last.author.num.prev.microarray.creations)
    dat$last.author.num.prev.oa = tr(dat.nums$last.author.num.prev.oa)
    dat$last.author.num.prev.other.sharing = tr(
        dat.nums$last.author.num.prev.genbank.sharing +
        dat.nums$last.author.num.prev.pdb.sharing +
        dat.nums$last.author.num.prev.swissprot.sharing)
    dat$last.author.num.prev.geoae.sharing = tr(dat.nums$last.author.num.prev.geoae.sharing)
    dat$last.author.num.prev.geo.reuse = tr(dat.nums$last.author.num.prev.geo.reuse)
    dat$last.author.num.prev.multi.center = tr(dat.nums$last.author.num.prev.multi.center)
    # skip last.author.prev.meta.analysis because not enough > 0

    # Set to NA if the number of publications is unrealistically big, because this probably means that
    # Author-ity combined two or more authors together. Better to set to NA and deal with the bias than
    # introduce such large numbers into our data
    last.author.thresh.cluster.size = quantile(dat$last.author.num.prev.pubs, .98, na.rm=TRUE)
    last.author.unrealistic.pub.clusters = which(dat$last.author.num.prev.pubs >= last.author.thresh.cluster.size)
    dat$last.author.num.prev.pubs[last.author.unrealistic.pub.clusters] = NA
    dat$last.author.num.prev.pmc.cites[last.author.unrealistic.pub.clusters] = NA
    dat$last.author.year.first.pub.ago[last.author.unrealistic.pub.clusters] = NA
    dat$last.author.num.prev.microarray.creations[last.author.unrealistic.pub.clusters] = NA
    dat$last.author.num.prev.oa[last.author.unrealistic.pub.clusters] = NA
    dat$last.author.num.prev.other.sharing[last.author.unrealistic.pub.clusters] = NA
    dat$last.author.num.prev.geoae.sharing[last.author.unrealistic.pub.clusters] = NA
    dat$last.author.num.prev.geo.reuse[last.author.unrealistic.pub.clusters] = NA
    dat$last.author.num.prev.multi.center[last.author.unrealistic.pub.clusters] = NA

    #### SOME FUNDING VARIABLES AND STUDY VARIABLES
    # skip address
    # skip institution, use institution_clean instead below
    # skip country, use country_clean instead below
    # skip grant_numbers, use lots of other related variables below
    dat$num.grant.numbers = tr(dat.nums$num.grant.numbers)
    dat$num.nih.is.nci = tr(dat.nums$nih.is.nci)
    dat$num.nih.is.nhlbi = tr(dat.nums$nih.is.nhlbi)
    dat$num.nih.is.ncrr = tr(dat.nums$nih.is.ncrr)
    dat$num.nih.is.niehs = tr(dat.nums$nih.is.niehs)
    dat$num.nih.is.ninds = tr(dat.nums$nih.is.ninds)
    dat$num.nih.is.niddk = tr(dat.nums$nih.is.niddk)
    dat$num.nih.is.nigms = tr(dat.nums$nih.is.nigms)
    dat$num.nih.is.niaid = tr(dat.nums$nih.is.niaid)
    # skip is_review because all 0s, as it should be
    # need to use pubmed.medline.status to figure out what should really be NA because of incomplete MeSH terms
    dat$pubmed.is.humans = ordered(medline.values(dat.nums$pubmed.is.humans, dat.raw$pubmed.medline.status))
    dat$pubmed.is.animals = ordered(medline.values(dat.nums$pubmed.is.animals, dat.raw$pubmed.medline.status))
    dat$pubmed.is.mice = ordered(medline.values(dat.nums$pubmed.is.mice, dat.raw$pubmed.medline.status))
    dat$pubmed.is.fungi = ordered(medline.values(dat.nums$pubmed.is.fungi, dat.raw$pubmed.medline.status))
    dat$pubmed.is.bacteria = ordered(medline.values(dat.nums$pubmed.is.bacteria, dat.raw$pubmed.medline.status))
    dat$pubmed.is.plants = ordered(medline.values(dat.nums$pubmed.is.plants, dat.raw$pubmed.medline.status))
    dat$pubmed.is.viruses = ordered(medline.values(dat.nums$pubmed.is.viruses, dat.raw$pubmed.medline.status))
    dat$pubmed.is.cultured.cells = ordered(medline.values(dat.nums$pubmed.is.cultured.cells, dat.raw$pubmed.medline.status))
    # The cancer flag appears not to depend entirely on MEDLINE completeness, so keep the raw values
    dat$pubmed.is.cancer = ordered(dat.nums$pubmed.is.cancer)
    # The open access filter, and some others below, are by journal not article, so they don't depend on MEDLINE completion status
    dat$pubmed.is.open.access = ordered(dat.nums$pubmed.is.open.access)
    dat$pubmed.is.effectiveness = ordered(medline.values(dat.nums$pubmed.is.effectiveness, dat.raw$pubmed.medline.status))
    dat$pubmed.is.diagnosis = ordered(medline.values(dat.nums$pubmed.is.diagnosis, dat.raw$pubmed.medline.status))
    dat$pubmed.is.prognosis = ordered(medline.values(dat.nums$pubmed.is.prognosis, dat.raw$pubmed.medline.status))
    dat$pubmed.is.core.clinical.journal = ordered(dat.nums$pubmed.is.core.clinical.journal)
    # skip is_clinical_trial because not enough
    # skip is_randomized_clinical_trial because not enough
    # skip is_meta_analysis because none (yay!)
    dat$pubmed.is.comparative.study = ordered(medline.values(dat.nums$pubmed.is.comparative.study, dat.raw$pubmed.medline.status))
    # skip multicenter because not enough (surprising?)
    # skip validation because not enough (surprising?)
    # skip funded_stimulus because all 0s
    # the external funding flag sometimes seems to be set even without full indexing, so it doesn't depend entirely on MEDLINE completeness
    # commenting out external in favour of a combined nih variable below
    #dat$pubmed.is.funded.nih.extramural = ordered(dat.nums$pubmed.is.funded.nih.extramural)
    dat$pubmed.is.funded.nih.intramural = ordered(dat.nums$pubmed.is.funded.nih.intramural)
    # Combining extramural, intramural, and whether the PMID had a link to funding in the NIH data sources
    dat$pubmed.is.funded.nih = ordered(pmax(dat.nums$pubmed.is.funded.nih.extramural, dat.nums$pubmed.is.funded.nih.intramural, !is.na(dat.nums$num.grants)))
    dat$pubmed.is.funded.non.us.govt = ordered(dat.nums$pubmed.is.funded.non.us.govt)
    # Sharing in PDB and Swissprot is very small, so combine them
    dat$pubmed.is.shared.other = ordered(pmax(dat.nums$pubmed.in.genbank, dat.nums$pubmed.in.pdb, dat.nums$pubmed.in.swissprot))
    # geo.reuse is tiny but I want to use it anyway!
    dat$pubmed.is.geo.reuse = ordered(dat.nums$is.geo.reuse)
    dat$pubmed.num.cites.from.pmc = tr(dat.nums$pubmed.number.times.cited.in.pmc)
    dat$pubmed.num.cites.from.pmc.per.year = dat.nums$pubmed.number.times.cited.in.pmc/(2010 - dat.nums$pubmed.year.published)
    # Comment these out for now, because they look wrong
    #dat$found.by.highwire = ordered(dat.nums$found.by.highwire)
    #dat$found.by.scirus = ordered(dat.nums$found.by.highwire)
    #dat$found.by.googlescholar = ordered(dat.nums$found.by.highwire)
    #dat$found.by.pmc = ordered(dat.nums$portal.pmids.found.by.pmc)

    ##### DEPENDENT VARIABLES
    dat$dataset.in.geo = ordered(dat.nums$pubmed.in.geo)
    # commenting this out in favour of a combined metric below
    #dat$dataset.in.arrayexpress = ordered(dat.nums$in.arrayexpress)
    dat$dataset.in.geo.or.ae = ordered(dat.nums$in.ae.or.geo)

    ###### INSTITUTION
    dat$country.australia = ordered(ifelse("Australia" == dat.raw$country.clean, 1, 0))
    dat$country.canada = ordered(ifelse("Canada" == dat.raw$country.clean, 1, 0))
    dat$country.china = ordered(ifelse("China" == dat.raw$country.clean, 1, 0))
    dat$country.france = ordered(ifelse("France" == dat.raw$country.clean, 1, 0))
    dat$country.germany = ordered(ifelse("Germany" == dat.raw$country.clean, 1, 0))
    dat$country.japan = ordered(ifelse("Japan" == dat.raw$country.clean, 1, 0))
    dat$country.korea = ordered(ifelse("Korea" == dat.raw$country.clean, 1, 0))
    dat$country.uk = ordered(ifelse("UK" == dat.raw$country.clean, 1, 0))
    dat$country.usa = ordered(ifelse("USA" == dat.raw$country.clean, 1, 0))
    dat$institution.is.medical = ordered(dat.nums$institution.hospital.or.medlcal)
    dat$institution.rank = dat.nums$institution.rank
    dat$institution.is.higher.ed = ordered(ifelse("Higher educ." == dat.raw$institution.sector, 1, 0))
    dat$institution.is.higher.ed[which(is.na(dat$institution.rank))] = NA
    dat$institution.is.govnt = ordered(ifelse("Government" == dat.raw$institution.sector, 1, 0))
    dat$institution.is.govnt[which(is.na(dat$institution.rank))] = NA
    dat$institution.research.output = tr(dat.nums$institution.output)
    dat$institution.international.collaboration = dat.nums$international.collaboration
    dat$institution.mean.norm.impact.factor = dat.nums$institution.norm.sjr
    dat$institution.mean.norm.citation.score = dat.nums$institution.norm.citation.score
    # Include variables for the three most common institutions
    #top.institutions = names(sort(table(dat.institutions$institutions.factor), decreasing=TRUE)[0:25])
    #display.institutions = c(top.institutions)
    dat$institution.harvard = ordered(ifelse("Harvard University" == dat.raw$institution.clean, 1, 0))
    dat$institution.nci = ordered(ifelse("National Cancer Institute" == dat.raw$institution.clean, 1, 0))
    dat$institution.stanford = ordered(ifelse("Stanford University" == dat.raw$institution.clean, 1, 0))

    ### JOURNAL VARIABLES
    # skip total cites 2008 because it could be due to so many things, not very informative
    #dat$journal.cites.2008 = tr(dat.nums$journal.2008.cites)
    dat$journal.impact.factor = log.tr(dat.nums$journal.impact.factor)
    # PLoS One is a big part of our sample, and doesn't have an official impact factor yet.
    # Important, because it is relatively low, so better to give it our estimate than leave it missing.
    # Estimate of 3 comes from http://pbeltrao.blogspot.com/2009/04/guestimating-plos-one-impact-factor.html
    dat$journal.impact.factor[which(dat.raw$pubmed.journal == "PLoS One")] = log.tr(3)
    dat$journal.5yr.impact.factor = log.tr(dat.nums$journal.5yr.impact.factor)
    dat$journal.immediacy.index = log.tr(dat.nums$journal.immediacy.index)
    dat$journal.num.articles.2008 = tr(dat.nums$journal.num.articles.2008)
    dat$journal.cited.halflife = dat.nums$journal.cited.halflife
    # skip journal_policy_try because it is redundant
    dat$journal.policy.requires.microarray.accession = ordered(ifelse(dat.nums$journal.policy.requires.microarray.accession > 0, 1, 0))
    # says.must.deposit should be true whenever requires.microarray.accession is true
    dat$journal.policy.says.must.deposit = ordered(ifelse(pmax(dat.nums$journal.policy.says.must.deposit, dat.nums$journal.policy.requires.microarray.accession) > 0, 1, 0))
    dat$journal.policy.at.least.requests.sharing.microarray = ordered(ifelse(dat.nums$journal.policy.at.least.requests.sharing.microarray > 0, 1, 0))
    dat$journal.policy.mentions.any.sharing = ordered(ifelse(dat.nums$journal.policy.mentions.any.sharing > 0, 1, 0))
    dat$journal.policy.general.statement = ordered(ifelse(dat.nums$journal.policy.general.statement > 0, 1, 0))
    # requests.accession should be true whenever requires.accession is true, but it looks like I didn't code it that way
    dat$journal.policy.requests.accession = ordered(ifelse(pmax(dat.nums$journal.policy.requests.accession, dat.nums$journal.policy.requires.microarray.accession) > 0, 1, 0))
    dat$journal.policy.mentions.exceptions = ordered(ifelse(dat.nums$journal.policy.mentions.exceptions > 0, 1, 0))
    dat$journal.policy.mentions.consequences = ordered(ifelse(dat.nums$journal.policy.mentions.consequences > 0, 1, 0))
    dat$journal.policy.contains.word.microarray = ordered(ifelse(dat.nums$journal.policy.contains.word.microarray > 0, 1, 0))
    dat$journal.policy.contains.word.miame.mged = ordered(ifelse(dat.nums$journal.policy.contains.word.MIAME.MGED > 0, 1, 0))
    dat$journal.policy.contains.word.arrayexpress = ordered(ifelse(dat.nums$journal.policy.contains.word.arrayexpress > 0, 1, 0))
    dat$journal.policy.contains.word.geo.omnibus = ordered(ifelse(dat.nums$journal.policy.contains.word.geo.omnibus > 0, 1, 0))
    dat$journal.policy.requests.sharing.other.data = ordered(ifelse(dat.nums$journal.policy.requests.sharing.other.data > 0, 1, 0))
    # For some reason this variable didn't have NAs, but it should have the same NAs as the other journal policy vars
    dat$journal.policy.requests.sharing.other.data[is.na(dat$journal.policy.requests.accession)] = NA
    # Add a variable counting how many microarray-creating papers (as defined by being in this sample)
    # the journal has published
    table.journal.counts = table(dat.raw$pubmed.journal)
    dat$journal.microarray.creating.count = tr(as.numeric(table.journal.counts[dat.raw$pubmed.journal]))
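    ### Toy illustration of the table lookup above (not run): each row gets the
    ### number of rows in this sample that share its journal name.
    #journals = c("A", "A", "B")
    #as.numeric(table(journals)[journals])   # 2 2 1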
    #### MORE FUNDING VARIABLES
    # set the missing values to 0 or tr(0), as appropriate
    # skip pmid:1 as duplicate column
    dat$num.grants.via.nih = tr(dat.nums$num.grants)
    dat$num.grants.via.nih[is.na(dat$num.grants.via.nih)] = tr(0)
    dat$max.grant.duration = tr(dat.nums$longest.num.years)
    dat$max.grant.duration[is.na(dat$max.grant.duration)] = tr(0)
    dat$nih.first.year.ago = tr(2010 - dat.nums$first.year)
    dat$nih.first.year.ago[is.na(dat$nih.first.year.ago)] = NA
    dat$nih.last.year.ago = tr(2010 - dat.nums$last.year)
    dat$nih.last.year.ago[is.na(dat$nih.last.year.ago)] = NA
    dat$nih.cumulative.years = tr(dat.nums$cumulative.years)
    dat$nih.cumulative.years[is.na(dat$nih.cumulative.years)] = tr(0)
    dat$nih.sum.avg.dollars = tr(dat.nums$sum.avg.dollars)
    dat$nih.sum.avg.dollars[is.na(dat$nih.sum.avg.dollars)] = tr(0)
    dat$nih.sum.sum.dollars = tr(dat.nums$sum.sum.dollars)
    dat$nih.sum.sum.dollars[is.na(dat$nih.sum.sum.dollars)] = tr(0)
    dat$nih.max.max.dollars = tr(dat.nums$max.max.dollars)
    dat$nih.max.max.dollars[is.na(dat$nih.max.max.dollars)] = tr(0)
    # skip
    # skip grant_activity_codes list, because captured below
    dat$has.R01.funding = ordered(dat.nums$has.R01.funding)
    dat$has.R01.funding[is.na(dat$has.R01.funding)] = 0
    #dat$has.T32.funding = ordered(dat.nums$has.T32.funding)
    #dat$has.T32.funding[is.na(dat$has.T32.funding)] = 0
    # other funding types dropped because they don't occur often enough
    # look for patterns here
    dat$has.P.funding = ordered(ifelse(grepl("P", dat.raw$grant.activity.codes), 1, 0))
    dat$has.R.funding = ordered(ifelse(grepl("R", dat.raw$grant.activity.codes), 1, 0))
    dat$has.T.funding = ordered(ifelse(grepl("T", dat.raw$grant.activity.codes), 1, 0))
    dat$has.U.funding = ordered(ifelse(grepl("U", dat.raw$grant.activity.codes), 1, 0))
    dat$has.K.funding = ordered(ifelse(grepl("K", dat.raw$grant.activity.codes), 1, 0))
    # see below this block for handling of their NAs
    # only pick one funding level, otherwise too singular
    # not enough post2007
    dat$num.post2003.morethan500k = tr(dat.nums$num.post2003.morethan500k)
    dat$num.post2004.morethan500k = tr(dat.nums$num.post2004.morethan500k)
    dat$num.post2005.morethan500k = tr(dat.nums$num.post2005.morethan500k)
    dat$num.post2006.morethan500k = tr(dat.nums$num.post2006.morethan500k)
    #dat$num.post2007.morethan500k = tr(dat.nums$num.post2007.morethan500k)
    dat$num.post2003.morethan750k = tr(dat.nums$num.post2003.morethan750k)
    dat$num.post2004.morethan750k = tr(dat.nums$num.post2004.morethan750k)
    dat$num.post2005.morethan750k = tr(dat.nums$num.post2005.morethan750k)
    dat$num.post2006.morethan750k = tr(dat.nums$num.post2006.morethan750k)
    #dat$num.post2007.morethan750k = tr(dat.nums$num.post2007.morethan750k)
    dat$num.post2003.morethan1000k = tr(dat.nums$num.post2003.morethan1000k)
    dat$num.post2004.morethan1000k = tr(dat.nums$num.post2004.morethan1000k)
    dat$num.post2005.morethan1000k = tr(dat.nums$num.post2005.morethan1000k)
    dat$num.post2006.morethan1000k = tr(dat.nums$num.post2006.morethan1000k)
    #dat$num.post2007.morethan1000k = tr(dat.nums$num.post2007.morethan1000k)
    dat$num.post2003.morethan500k[is.na(dat$num.post2003.morethan500k)] = tr(0)
    dat$num.post2004.morethan500k[is.na(dat$num.post2004.morethan500k)] = tr(0)
    dat$num.post2005.morethan500k[is.na(dat$num.post2005.morethan500k)] = tr(0)
    dat$num.post2006.morethan500k[is.na(dat$num.post2006.morethan500k)] = tr(0)
    #dat$num.post2007.morethan500k[is.na(dat$num.post2007.morethan500k)] = tr(0)
    dat$num.post2003.morethan750k[is.na(dat$num.post2003.morethan750k)] = tr(0)
    dat$num.post2004.morethan750k[is.na(dat$num.post2004.morethan750k)] = tr(0)
    dat$num.post2005.morethan750k[is.na(dat$num.post2005.morethan750k)] = tr(0)
    dat$num.post2006.morethan750k[is.na(dat$num.post2006.morethan750k)] = tr(0)
    #dat$num.post2007.morethan750k[is.na(dat$num.post2007.morethan750k)] = tr(0)
    dat$num.post2003.morethan1000k[is.na(dat$num.post2003.morethan1000k)] = tr(0)
    dat$num.post2004.morethan1000k[is.na(dat$num.post2004.morethan1000k)] = tr(0)
    dat$num.post2005.morethan1000k[is.na(dat$num.post2005.morethan1000k)] = tr(0)
    dat$num.post2006.morethan1000k[is.na(dat$num.post2006.morethan1000k)] = tr(0)
    #dat$num.post2007.morethan1000k[is.na(dat$num.post2007.morethan1000k)] = tr(0)
    dat$dataset.in.geo.or.ae.int = dat.nums$in.ae.or.geo
    #dim(dat)
    #save(dat, file="dat.Rdata")
    return(dat)
}
explore.cleaned.data = function
(
    dat
)
{
    library(Hmisc)
    library(psych)
    # both Hmisc and psych have a describe()
    print(Hmisc::describe(dat))  # print explicitly so the output is displayed from inside the function
    #psych::describe(dat)
    s = summary(dataset.in.geo.or.ae.int ~ ., dat)
    s
}
display.attributes = c(
    "num.authors"
    ,"first.author.female"
    ,"first.author.gender.not.found"
    ,"first.author.num.prev.pubs"
    ,"first.author.num.prev.pmc.cites"
    ,"first.author.year.first.pub.ago"
    ,"first.author.num.prev.microarray.creations"
    ,"first.author.num.prev.oa"
    ,"first.author.num.prev.other.sharing"
    ,"first.author.num.prev.geo.reuse"
    ,"first.author.num.prev.multi.center"
    ,"last.author.female"
    ,"last.author.gender.not.found"
    ,"last.author.num.prev.pubs"
    ,"last.author.num.prev.pmc.cites"
    ,"last.author.year.first.pub.ago"
    ,"last.author.num.prev.microarray.creations"
    ,"last.author.num.prev.oa"
    ,"last.author.num.prev.other.sharing"
    ,"last.author.num.prev.geo.reuse"
    ,"last.author.num.prev.multi.center"
    ,"num.grant.numbers"
    ,"num.nih.is.nci"
    ,"num.nih.is.nhlbi"
    ,"num.nih.is.ncrr"
    ,"num.nih.is.niehs"
    ,"num.nih.is.ninds"
    ,"num.nih.is.niddk"
    ,"num.nih.is.nigms"
    ,"num.nih.is.niaid"
    ,"pubmed.is.humans"
    ,"pubmed.is.animals"
    ,"pubmed.is.mice"
    ,"pubmed.is.fungi"
    ,"pubmed.is.bacteria"
    ,"pubmed.is.plants"
    ,"pubmed.is.viruses"
    ,"pubmed.is.cultured.cells"
    ,"pubmed.is.cancer"
    ,"pubmed.is.open.access"
    ,"pubmed.is.effectiveness"
    ,"pubmed.is.diagnosis"
    ,"pubmed.is.prognosis"
    ,"pubmed.is.core.clinical.journal"
    ,"pubmed.is.comparative.study"
    ,"pubmed.is.funded.nih.intramural"
    ,"pubmed.is.funded.nih"
    ,"pubmed.is.funded.non.us.govt"
    ,"pubmed.is.shared.other"
    ,"pubmed.num.cites.from.pmc"
    ,"pubmed.num.cites.from.pmc.per.year"
    ,"dataset.in.geo"
    ,"dataset.in.geo.or.ae"
    ,"country.australia"
    ,"country.canada"
    ,"country.china"
    ,"country.france"
    ,"country.germany"
    ,"country.japan"
    ,"country.korea"
    ,"country.uk"
    ,"country.usa"
    ,"institution.is.medical"
    ,"institution.rank"
    ,"institution.is.higher.ed"
    ,"institution.is.govnt"
    ,"institution.research.output"
    ,"institution.international.collaboration"
    ,"institution.mean.norm.impact.factor"
    ,"institution.mean.norm.citation.score"
    ,"journal.impact.factor"
    ,"journal.num.articles.2008"
    ,"journal.policy.requires.microarray.accession"
    ,"journal.policy.general.statement"
    ,"journal.policy.requests.accession"
    ,"journal.policy.mentions.exceptions"
    ,"journal.policy.mentions.consequences"
    ,"journal.policy.contains.word.miame.mged"
    ,"journal.policy.contains.word.arrayexpress"
    ,"journal.policy.contains.word.geo.omnibus"
    ,"journal.policy.requests.sharing.other.data"
    ,"journal.microarray.creating.count"
    ,"num.grants.via.nih"
    ,"max.grant.duration"
    ,"nih.first.year.ago"
    ,"nih.last.year.ago"
    ,"nih.cumulative.years"
    ,"nih.sum.avg.dollars"
    ,"nih.sum.sum.dollars"
    ,"nih.max.max.dollars"
    ,"has.R01.funding"
    ,"has.P.funding"
    ,"has.R.funding"
    ,"has.T.funding"
    ,"has.U.funding"
    ,"has.K.funding"
    ,"num.post2004.morethan1000k"
    ,"dataset.in.geo.or.ae.int"
)
make.figure.S1toS5 = function
(
    dat.untransformed,
    screen=SCREEN,
    format=FORMAT
)
{
    label(dat.untransformed$first.author.num.prev.microarray.creations) <- "first.author.num.prev.micro.produce"
    label(dat.untransformed$last.author.num.prev.microarray.creations) <- "last.author.num.prev.micro.produce"
    label(dat.untransformed$journal.policy.at.least.requests.sharing.microarray) <- "journal.policy.requests.or.requires.micro.data"
    label(dat.untransformed$journal.policy.contains.word.geo.omnibus) <- "journal.policy.contains.word.geo.omnibus"
    # to see formatting, try ?dotchart2 or ?plot.summary.formula.response
    mynames = names(dat.untransformed)
    sections = 5
    section.length = ceiling(length(mynames) / sections)  # base R ceiling(); ceil() is not a base function
    #for (ii in 0:0) {
    for (ii in 0:(sections-1)) {
        ## Check to make sure this is an even divide so we don't miss any!
        #first.count = ii*section.length
        #last.count = min(length(mynames), ((ii+1)*section.length)-1)
        #mynames.section = mynames[first.count:last.count]
        # group them by topic instead
        variable.groups = c("^num.authors|^first|^last",
            "journal",
            "pubmed",
            "nih|grant|num.post|funding",
            "country|institution")
        mynames.section = mynames[grep(variable.groups[ii+1], mynames)]
        mynames.section = intersect(mynames.section, display.attributes)
        if ("dataset.in.geo.or.ae.int" %nin% mynames.section) {
            mynames.section = c(mynames.section, "dataset.in.geo.or.ae.int")
        }
        print(ii)
        print(mynames.section)
        s = summary(dataset.in.geo.or.ae.int ~ ., dat.untransformed[, names(dat.untransformed) %in% mynames.section], continuous = 3)
        #s
        #quartz()
        filename = paste("figureS", ii+1, sep="")
        print(filename)
        setup.figure(filename, MAX.FIGURE.WIDTH, MAX.FIGURE.HEIGHT, screen=screen, format=format)
        par(mai=c(1, 5, 0.2, 1))  # bottom, left, top, right
        response = try(
            plot.summary.formula.response.CIs(s, width.factor=2, cex.labels=0.4, cex=0.9, xlim=c(0,1), xlab="Proportion of studies with datasets\nfound in GEO or ArrayExpress", main="")
            , TRUE)
        if (class(response) == "try-error") {
            plot(s, width.factor=2, cex.labels=0.4, cex=0.9, xlim=c(0,1), xlab="Proportion of studies with datasets\nfound in GEO or ArrayExpress", main="")
        }
        close.figure(filename, screen=screen, format=format)
    }
}
get.dat.nums = function
( dat.raw )
{
    library(plyr)
    dat.nums = colwise(as.numeric)(dat.raw) ##<<details this produces warnings
    return(dat.nums)
}
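### The warnings from colwise(as.numeric) come from character columns (journal names,
### countries, etc.) being coerced to NA, e.g. (a sketch, not run):
#as.numeric(c("12", "BMC Genomics"))   # 12 NA, with a "NAs introduced by coercion" warning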
make.figure.journals = function
(
    dat,
    n.journals = 25,
    screen=SCREEN,
    format=FORMAT
)
{
    dat.journals = dat
    dat.journals$dataset.in.geo.or.ae.int = as.numeric(dat.journals$in.ae.or.geo)
    # Consolidate the less frequent journals into Other
    top.journals = names(sort(table(dat.journals$pubmed.journal), decreasing=TRUE)[0:n.journals])
    dat.journals$journal.factor = factor(dat.journals$pubmed.journal)
    levels(dat.journals$journal.factor)[levels(dat.journals$journal.factor) %nin% top.journals] <- "OTHER"
    s = summary(dataset.in.geo.or.ae.int ~ journal.factor, dat.journals)
    s
    setup.figure("journals", MAX.FIGURE.WIDTH, MAX.FIGURE.WIDTH, screen=screen, format=format)
    par(mai=c(1, 5, 0.2, 1))  # bottom, left, top, right
    plot.summary.formula.response.CIs(s, width.factor=2, cex.labels=0.5, cex=0.9, xlim=c(0,1), xlab="Proportion of studies with datasets\nfound in GEO or ArrayExpress", main="")
    close.figure("journals", screen=screen, format=format)
    ps.chisq = c()
    for (var in top.journals) {
        pv1 <- chisq.test(table(dat.journals$dataset.in.geo.or.ae.int, dat.journals$journal.factor==var))$p.value
        ps.chisq = cbind(ps.chisq, pv1)
    }
    colnames(ps.chisq) = top.journals
    # p.adjust returns a plain vector (the matrix dimensions are dropped), so carry the labels in names()
    adjusted.ps.chisq = p.adjust(ps.chisq, method = c("none"))
    names(adjusted.ps.chisq) = colnames(ps.chisq)
    print(t(rbind(names(adjusted.ps.chisq)[order(adjusted.ps.chisq)], adjusted.ps.chisq[order(adjusted.ps.chisq)])))
}
### RUN IT ALL!
setwd("~/Documents/Projects/ASIST2011/paper")

## The source file below can be found at https://gist.github.com/999036
source("plot.summary.formula.response.CIs.R")

## Dataset was downloaded from http://dx.doi.org/10.5061/dryad.mf1sd/1.
## Please cite use of this data in the references section of publications as
## Piwowar HA (2011) Data from: Who shares? Who doesn't? Factors associated with openly archiving raw research data. Dryad Digital Repository. doi:10.5061/dryad.mf1sd
raw.data.filename = "rawdata.txt"
dat.raw.all = read.raw.data(raw.data.filename)

## Keep just the data that has journal policy information
dat.raw.policycomplete = dat.raw.all
dat.raw.policycomplete = dat.raw.policycomplete[!is.na(dat.raw.policycomplete$journal.policy.says.must.deposit),]
dat.raw.policycomplete = dat.raw.policycomplete[(dat.raw.policycomplete$pubmed.year.published < 2010),]

## Keep just the studies published in journals that require deposition
dat.raw.policymustdeposit = dat.raw.policycomplete[(dat.raw.policycomplete$journal.policy.says.must.deposit > 0),]

## Keep just the studies where neither the first nor last author has shared before (by my estimates)
dat.raw.policymustdeposit.nevershared = dat.raw.policymustdeposit[intersect(
    which(dat.raw.policymustdeposit$first.author.num.prev.geoae.sharing==0),
    which(dat.raw.policymustdeposit$last.author.num.prev.geoae.sharing==0)),]

## Keep just the recent studies, published from 2007 through 2009
dat.raw.policymustdeposit.nevershared.recent = subset(dat.raw.policymustdeposit.nevershared, subset=pubmed.year.published>=2007)
dat.raw = dat.raw.policymustdeposit.nevershared.recent
## Process the data
dat.clean = preprocess.raw.data(dat.raw)
dat.clean = subset(dat.clean, select = -first.author.num.prev.geoae.sharing)
dat.clean = subset(dat.clean, select = -last.author.num.prev.geoae.sharing)

## Display the results
options(digits=4)
options(width=100)
explore.cleaned.data(dat.clean)
make.figure.S1toS5(dat.clean, screen=TRUE, format=FORMAT)
make.figure.journals(dat.raw, 25, screen=TRUE, format=FORMAT)

#### Look at which categorical variables have significant differences
dat = dat.clean
ps.chisq = c()
for (var in names(dat)) {
    pv1 <- chisq.test(table(dat[,"dataset.in.geo.or.ae.int"], dat[,var]))$p.value
    ps.chisq = cbind(ps.chisq, pv1)
}
colnames(ps.chisq) = names(dat)
# Review the type of p-value multiple comparison adjustment used below before interpreting!
adjusted.ps.chisq = p.adjust(ps.chisq, method = c("none"))
# p.adjust returns a plain vector (the matrix dimensions are dropped), so carry the labels in names()
names(adjusted.ps.chisq) = colnames(ps.chisq)
varnames = names(adjusted.ps.chisq)
ordered.results = order(adjusted.ps.chisq)
print("chi-squared p-values, so only look at the results for categorical variables (review the multiple comparison adjustment used above before interpreting!):")
print(t(rbind(varnames[ordered.results], adjusted.ps.chisq[ordered.results])))

#### Look at which continuous variables have significant differences
ps.wilcox = c()
for (var in names(dat)) {
    pv1 = NA
    try(pv1 <- wilcox.test(dat[,var] ~ dat[,"dataset.in.geo.or.ae.int"])$p.value, TRUE)
    ps.wilcox = cbind(ps.wilcox, pv1)
}
colnames(ps.wilcox) = names(dat)
# Review the type of p-value multiple comparison adjustment used below before interpreting!
adjusted.ps.wilcox = p.adjust(ps.wilcox, method = c("none"))
names(adjusted.ps.wilcox) = colnames(ps.wilcox)
varnames = names(adjusted.ps.wilcox)
ordered.results = order(adjusted.ps.wilcox)
print("wilcox p-values, so only look at the results for continuous variables (review the multiple comparison adjustment used above before interpreting!):")
print(t(rbind(varnames[ordered.results], adjusted.ps.wilcox[ordered.results])))
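#### If a multiple-comparison correction is preferred over method="none" above, a
#### Benjamini-Hochberg (FDR) adjustment could be substituted, e.g. (a sketch, not run):
#adjusted.ps.wilcox.bh = p.adjust(ps.wilcox, method="BH")
#names(adjusted.ps.wilcox.bh) = colnames(ps.wilcox)
#print(sort(adjusted.ps.wilcox.bh))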