Created
January 16, 2015 04:43
-
-
Save jtleek/c5158965d77c21ade424 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Load libraries
## (XML: HTML parsing and XPath; RCurl: HTTP download; dplyr is attached as in
## the original script.)
library(XML)
library(dplyr)
library(RCurl)
## Get the results for a specific term
##
## Fetches the first `npages` result pages of a Google Scholar search and
## extracts publication year, citation count and title for each hit.
##
## Arguments:
##   search_term  character string; its space-separated words are joined
##                with "+" to form the query
##   npages       number of result pages to fetch; each page is treated as
##                holding 10 results
##
## Returns a data frame with 10 * npages rows and the columns
##   pub_year (integer), cites (numeric), title (character).
## Any value that cannot be extracted from a page is left as NA.
scrape_term <- function(search_term, npages) {
  per_page <- 10L
  base_url <- "http://scholar.google.com/scholar?"
  search_string <- paste0(
    "q=", paste0(strsplit(search_term, " ")[[1]], collapse = "+")
  )

  ## Preallocate typed columns. The previous
  ## data.frame(NA, nrow = ..., ncol = 3) call did not build an empty table:
  ## it created a one-row frame with columns named "NA.", "nrow" and "ncol".
  n_rows <- per_page * npages
  dat <- data.frame(
    pub_year = rep(NA_integer_, n_rows),
    cites = rep(NA_real_, n_rows),
    title = rep(NA_character_, n_rows),
    stringsAsFactors = FALSE
  )

  ## xpathSApply() returns an empty list when nothing matches; flatten the
  ## result so it can always be indexed like a character vector.
  node_text <- function(doc, path) {
    as.character(unlist(xpathSApply(doc, path, xmlValue)))
  }

  ## XPath for the first link inside each result's footer ("gs_fl" within
  ## "gs_ri"); kept exactly as in the original script.
  cite_path <- paste0(
    '//*[contains(concat( " ", @class, " " ), concat( " ", "gs_ri", " " ))]',
    '//*[contains(concat( " ", @class, " " ), concat( " ", "gs_fl", " " ))]',
    '//a[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]'
  )
  year_pattern <- ".*\\s(\\d{4})\\s.*"

  for (i in seq_len(npages)) {
    url1 <- paste0(base_url, search_string)
    if (i > 1) {
      url1 <- paste0(url1, "&start=", (i - 1) * per_page)
    }

    ## Download with RCurl and parse the text, rather than letting the XML
    ## package fetch the URL itself.
    page <- getURL(url1, followlocation = TRUE, .encoding = "UTF-8")
    doc <- htmlParse(page, asText = TRUE, encoding = "UTF-8")

    ## Indexing past the end yields NA, so every vector below has exactly
    ## `per_page` entries even when a page returns fewer (or more) hits.
    titles <- node_text(doc, "//h3[@class='gs_rt']")[seq_len(per_page)]

    ## Every third matched link (1, 4, 7, ...) is taken as the "Cited by N"
    ## link, as in the original `cites[1:10*3-2]`.
    ## NOTE(review): this stride depends on the page layout -- confirm it
    ## still matches the current markup.
    cite_links <- node_text(doc, cite_path)
    cite_links <- cite_links[seq(1L, by = 3L, length.out = per_page)]
    cites <- as.numeric(vapply(
      cite_links,
      function(x) strsplit(x, "Cited by ")[[1]][2],
      character(1),
      USE.NAMES = FALSE
    ))

    ## Pull a whitespace-delimited four-digit number out of the byline;
    ## entries without one stay NA instead of triggering coercion warnings.
    pub <- node_text(doc, "//div[@class='gs_a']")[seq_len(per_page)]
    has_year <- grepl(year_pattern, pub)
    pub_years <- rep(NA_integer_, per_page)
    pub_years[has_year] <- as.integer(
      gsub(year_pattern, "\\1", pub[has_year])
    )

    ind <- (i - 1) * per_page + seq_len(per_page)
    dat[ind, "pub_year"] <- pub_years
    dat[ind, "cites"] <- cites
    dat[ind, "title"] <- titles
  }

  dat
}
## Search for these terms
terms <- c(
  "empirical processes",
  "proportional hazards model",
  "generalized linear model",
  "semiparametric",
  "generalized estimating equation",
  "false discovery rate",
  "microarray statistics",
  "lasso shrinkage",
  "rna-seq statistics"
)
nterms <- length(terms)
term_data <- vector(mode = "list", length = nterms)
npages <- 3

## Scrape each term in turn and keep one data frame per term.
for (i in seq_along(terms)) {
  term_data[[i]] <- scrape_term(terms[i], npages)
  ## Tag every row with the term that produced it; sizing from the actual
  ## row count avoids relying on npages * 10 matching the scraper's output.
  term_data[[i]]$term <- rep(terms[i], nrow(term_data[[i]]))
  ## Wait 3 seconds before the next term (presumably to go easy on the
  ## server), then print the index of the term just finished as progress.
  Sys.sleep(3)
  cat(i)
}
## Flatten the per-term data frames into parallel vectors (one entry per hit).
term_vec <- unlist(lapply(term_data, function(x) as.character(x$term)))
## Put the term factor in order for the boxplot: levels follow the order of
## `terms` rather than a hard-coded rep(1:9, each = 30) score vector, so the
## ordering stays correct if `terms` or `npages` changes.
term_vec <- factor(term_vec, levels = terms)
## Make the axis abbreviated by changing labels
term_labels <- c(
  "Emp. Proc.", "Prop. Haz.", "GLM", "Semi-param.", "GEE", "FDR",
  "microarray", "lasso", "rna-seq"
)
## The labels are positional, so they must line up one-to-one with `terms`.
stopifnot(length(term_labels) == nlevels(term_vec))
levels(term_vec) <- term_labels
pubyear_vec <- unlist(lapply(term_data, function(x) x$pub_year))
title_vec <- unlist(lapply(term_data, function(x) x$title))
## Create the plot
png(filename = "citations-boxplot.png", height = 400, width = 600)
par(
  bg = "black", fg = "white", col.axis = "white",
  col.lab = "white", col.main = "white",
  mar = c(6, 4, 4, 2)
)
boxcol <- "#20B2E3"
pointcol <- "white"

## Compute the box statistics without drawing; only the group names are
## needed (for the x-axis labels). Drawing here and then calling boxplot()
## again would start a second page and discard this one, grid included.
tmp <- boxplot(pubyear_vec ~ term_vec, plot = FALSE)

## Draw the boxes with both default axes suppressed; they are added by hand
## below. xlab = "" stops newer R versions from printing the grouping
## variable's name under the rotated labels.
boxplot(
  pubyear_vec ~ term_vec,
  col = boxcol, bty = "n", xaxt = "n", yaxt = "n",
  xlab = "", ylab = "year",
  main = "Publication Year of First 30 G.S. Hits",
  frame.plot = FALSE
)
## Horizontal grid lines only, on the page that is actually saved.
grid(nx = NA, ny = NULL)
## Overlay the individual publication years as jittered points.
stripchart(
  pubyear_vec ~ term_vec,
  vertical = TRUE, method = "jitter", add = TRUE,
  pch = 19, col = pointcol, cex = 0.5
)
axis(
  side = 1, at = seq_along(tmp$names), labels = tmp$names,
  tick = FALSE, las = 2
)
## `at2` was never defined in this script; use the default tick positions
## of the current plot instead.
axis(side = 2, at = axTicks(2), tick = FALSE)
## add_simply_logo() is not defined in this file; only call it when it is
## available in the session.
if (exists("add_simply_logo", mode = "function")) {
  add_simply_logo("black")
}
dev.off()

## Mean publication year per term, ignoring hits with no extracted year.
tapply(pubyear_vec, term_vec, function(x) mean(x, na.rm = TRUE))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
There is an issue with the XML package. A workaround is to fetch the page with httr and parse the downloaded content instead:
library(httr)
doc <- htmlParse(rawToChar(GET(url)$content))