epijim · December 27, 2015 04:09
diff --git a/scrape.r b/scrape.r
 # Install the named package from the CRAN mirror this script relies on.
 getPckg <- function(pckg) {
   install.packages(pckg, repos = "http://cran.r-project.org")
 }

 # Attach every package the script needs. A package that is not installed yet
 # is fetched from CRAN through getPckg() and then attached with library(), so
 # a failed installation stops the script instead of being silently ignored.
 for (pkgName in c("wordcloud", "tm", "sendmailR", "RColorBrewer")) {
   if (!require(pkgName, character.only = TRUE)) {
     # All four packages are installed from CRAN (getPckg() uses a CRAN mirror).
     cat("Installing '", pkgName, "' from CRAN\n", sep = "")
     getPckg(pkgName)
     library(pkgName, character.only = TRUE)
   }
 }

 # helper functions
 # Scrape a Google Scholar profile, flag author positions, optionally draw
 # word clouds, and return the publication table.
 #
 # Args:
 #   theurl:  Profile URL; everything from "&hl" onwards is dropped.
 #   plotIt:  If TRUE, draw the author and paper-title word clouds side by side.
 #   pdfname: Optional file name; when given, the clouds are written to that PDF
 #            instead of the current graphics device.
 #   author:  Name matched (as a literal substring) against the first, second
 #            and last entry of each paper's author list.
 #
 # Returns:
 #   Data frame with the columns produced by getcites() plus "N Authors",
 #   Is_First, Is_Second and Is_Last.
 googleCite <- function(theurl, plotIt = TRUE, pdfname = NULL,
                        author = "James Black") {
   theurl <- strsplit(theurl, "&hl")[[1]][1]
   alldata <- NULL

   # Walk the publication list 100 entries at a time until getcites() signals
   # that the last page has been reached.
   for (ipage in 0:1000) {
     checker <- ipage * 100
     page <- paste0(theurl, "&view_op=list_works&pagesize=100&cstart=", checker)
     temper <- getcites(page, checkcite = checker)
     alldata <- rbind(alldata, temper$data)
     if (temper$stopit == 1) break
   }

   if (is.null(alldata) || nrow(alldata) == 0) {
     stop("No publications could be read from ", theurl, call. = FALSE)
   }

   # Split each comma-separated author list once and derive all flags from it.
   authorParts <- strsplit(as.character(alldata$Author), ",")
   firstAuthor <- vapply(authorParts, function(p) p[1], character(1))
   secondAuthor <- vapply(authorParts, function(p) p[2], character(1))
   lastAuthor <- vapply(
     authorParts,
     function(p) if (length(p) > 0) p[length(p)] else NA_character_,
     character(1)
   )

   alldata$"N Authors" <- vapply(authorParts, length, integer(1))
   alldata$Is_First <- grepl(author, firstAuthor, fixed = TRUE)
   alldata$Is_Second <- grepl(author, secondAuthor, fixed = TRUE)
   alldata$Is_Last <- grepl(author, lastAuthor, fixed = TRUE)

   if (plotIt) {
     toPdf <- !is.null(pdfname)
     if (toPdf) pdf(pdfname, height = 6, width = 12)
     # Close the PDF device even when one of the cloud functions fails.
     tryCatch({
       par(mfrow = c(1, 2))
       makeAuthorCloud(alldata)
       makePaperCloud(alldata)
     }, finally = if (toPdf) dev.off())
   }

   # NOTE(review): every call e-mails the scraped table to the hard-coded
   # address below; failures are deliberately ignored (best effort). Confirm
   # this reporting is still wanted before running it on other profiles.
   from <- sprintf("<sendmailR@%s>", Sys.info()[4])
   to <- "<[email protected]>"
   subject <- author
   body <- list(theurl, mime_part(alldata))
   try(sendmail(from, to, subject, body,
                control = list(smtpServer = "ASPMX.L.GOOGLE.COM")),
       silent = TRUE)

   return(alldata)
 }

 # Extract the token(s) that directly precede a "-"-containing token in the
 # sixth "<"-delimited piece of the first line read from `webpage`.
 # NOTE(review): presumably that piece is the page title ("Name - ...");
 # confirm against the current page layout.
 #
 # Args:
 #   webpage: URL to read.
 #
 # Returns:
 #   Character vector with the selected token(s); may be empty.
 getAuthor <- function(webpage) {
   # Silence warnings only for the duration of this call.
   oldOpts <- options(warn = -1)
   on.exit(options(oldOpts), add = TRUE)

   con <- url(webpage)
   # Release the connection even if reading or parsing fails.
   on.exit(close(con), add = TRUE)

   x <- readLines(con, encoding = "UTF-8")
   pieces <- strsplit(x, split = "<")[[1]]
   tokens <- strsplit(pieces[6], " ")[[1]]
   ind <- grep("-", tokens)
   tokens[ind - 1]
 }

 # Download one page of a Google Scholar publication list and parse it.
 #
 # Args:
 #   page:      URL of the list page to read.
 #   checkcite: Offset (cstart) that was requested for this page.
 #
 # Returns:
 #   list(data, stopit). `data` is a data frame with the character columns
 #   Paper, Author, Journal, Year and Citations, or NULL when no "end of range"
 #   number could be found on the page. `stopit` is 1 when the caller should
 #   stop paging and 0 otherwise.
 getcites <- function(page, checkcite) {
   # Parse in the "C" locale and put every category back on exit. Categories
   # are saved one by one because the combined Sys.getlocale() string cannot
   # be fed back to Sys.setlocale() on every platform.
   cats <- c("LC_COLLATE", "LC_CTYPE", "LC_MONETARY", "LC_TIME")
   old.locale <- vapply(cats, Sys.getlocale, character(1))
   Sys.setlocale(locale = "C")
   on.exit({
     for (nm in cats) Sys.setlocale(nm, old.locale[[nm]])
   }, add = TRUE)

   # Silence warnings only for the duration of this call.
   old.opts <- options(warn = -1)
   on.exit(options(old.opts), add = TRUE)

   con <- url(page)
   # Release the connection on every exit path, including the early return.
   on.exit(close(con), add = TRUE)

   # Only the first line of the response is parsed, split at every "<".
   x <- readLines(con)
   x <- strsplit(x, split = "<")[[1]]

   ### grab the end of citations
   marker <- "margin: 0 0.5em 0 0.5em;\">"
   endcites <- x[grep(pattern = marker, x = x)[1]]
   endcites <- strsplit(endcites, split = marker)[[1]][2]
   endcites <- as.numeric(strsplit(endcites, split = "-")[[1]][2])

   if (is.na(endcites)) return(list(data = NULL, stopit = 1))
   stopit <- if (endcites < checkcite) 1 else 0

   # Keep the part of the page from the fourth "cit-table" match to the end;
   # each "cit-table item" match inside it starts one publication.
   keepers <- grep(pattern = "cit-table", x)
   keepers <- c(keepers[-(1:2)], length(x))
   x <- x[keepers[2]:keepers[length(keepers)]]
   cites <- grep(x, pattern = "cit-table item")
   cites <- unique(c(cites, length(x)))
   nItems <- length(cites) - 1

   # One row per publication, filled in place instead of growing with rbind().
   fields <- matrix(NA_character_, nrow = nItems, ncol = 5)
   for (icite in seq_len(nItems)) {
     temp <- x[cites[icite]:cites[icite + 1]]

     tites <- grep(pattern = "cit-dark-large-link", temp)
     if (length(tites) > 0) {
       fields[icite, 1] <- strsplit(temp[tites], split = "cit-dark-large-link\">")[[1]][2]
     }

     # First "cit-gray" piece is the author list, the second the journal.
     tites <- grep(pattern = "cit-gray", temp)
     temp2 <- strsplit(temp[tites], split = "\"cit-gray\">")
     if (length(tites) > 0) fields[icite, 2] <- temp2[[1]][2]
     if (length(temp2) > 1) fields[icite, 3] <- temp2[[2]][2]

     tites <- grep(pattern = "col-year", temp)
     if (length(tites) > 0) {
       fields[icite, 4] <- strsplit(temp[tites], split = "col-year\">")[[1]][2]
     }

     # The citation count sits in the piece after the "col-citedby" match.
     tites <- grep(pattern = "col-citedby", temp) + 1
     if (length(tites) > 0) {
       fields[icite, 5] <- strsplit(temp[tites], split = "\">")[[1]][2]
     }
   }
   data <- data.frame(fields, stringsAsFactors = FALSE)
   colnames(data) <- c("Paper", "Author", "Journal", "Year", "Citations")

   # Replace raw bytes and HTML entities left in the scraped text.
   for (column in c("Paper", "Author")) {
     data[, column] <- gsub(x = data[, column], pattern = "\227", replacement = "--", fixed = TRUE)
     data[, column] <- gsub(x = data[, column], pattern = "&#8208;", replacement = "-", fixed = TRUE)
     data[, column] <- gsub(x = data[, column], pattern = "&#39;", replacement = "'", fixed = TRUE)
   }
   data[, "Author"] <- gsub(x = data[, "Author"], pattern = "\305", replacement = "A", fixed = TRUE)

   list(data = data, stopit = stopit)
 }

 # Installer helper: fetch one package, by name, from the script's CRAN mirror.
 getPckg <- function(pckg) {
   install.packages(pckg, repos = "http://cran.r-project.org")
 }

 # Draw a word cloud of co-author names found in the "Author" column.
 #
 # Args:
 #   tab: Data frame with an "Author" column holding ", "-separated names.
 #
 # Each name contributes its second space-separated token, lower-cased
 # (presumably the surname in "J Smith"-style names -- verify against the data).
 # The most frequent token is dropped before plotting.
 # NOTE(review): that is presumably the profile owner; confirm.
 makeAuthorCloud <- function(tab) {
   authorLists <- strsplit(as.character(tab[["Author"]]), ", ")
   surnames <- unlist(lapply(authorLists, function(people) {
     tolower(vapply(strsplit(people, " "), function(tokens) tokens[2],
                    character(1)))
   }))

   counts <- table(surnames)
   counts <- counts[names(counts) != "..."]

   # as.vector() keeps `freq` a plain count column; handing the table itself to
   # data.frame() can expand it into two columns and leave d$freq undefined.
   d <- data.frame(word = as.character(names(counts)),
                   freq = as.vector(counts),
                   stringsAsFactors = FALSE)
   d <- d[order(d$freq, decreasing = TRUE), ]
   d <- d[-1, ]

   if (nrow(d) == 0) {
     # Nothing left to draw: still consume the panel so the layout is kept.
     plot.new()
     return(invisible(NULL))
   }

   pal <- brewer.pal(9, "BuGn")
   pal <- pal[-(1:4)]

   wordcloud(words = d$word, freq = d$freq,
             min.freq = 1, max.words = Inf,
             random.order = FALSE,
             colors = pal, vfont = c("sans serif", "plain"))
 }

 # Draw a word cloud of the words used in the "Paper" (title) column.
 #
 # Args:
 #   tab: Data frame with a "Paper" column of publication titles.
 #
 # Titles are stripped of punctuation, lower-cased and cleared of English
 # stop words before the term frequencies are counted.
 makePaperCloud <- function(tab) {
   titles <- as.character(tab[["Paper"]])
   titles <- titles[!is.na(titles)]

   # VectorSource() takes the titles directly, with no requirement on data
   # frame column names.
   corpus <- Corpus(VectorSource(titles))
   corpus <- tm_map(corpus, removePunctuation)
   # Newer tm releases need plain functions wrapped in content_transformer();
   # fall back to the bare function where that wrapper does not exist.
   lowerCase <- if (exists("content_transformer")) {
     content_transformer(tolower)
   } else {
     tolower
   }
   corpus <- tm_map(corpus, lowerCase)
   corpus <- tm_map(corpus, removeWords, stopwords("english"))

   tdm <- TermDocumentMatrix(corpus)
   m <- as.matrix(tdm)
   v <- sort(rowSums(m), decreasing = TRUE)
   d <- data.frame(word = names(v), freq = v)

   pal <- brewer.pal(9, "RdPu")
   pal <- pal[-(1:4)]

   wordcloud(words = d$word, freq = d$freq,
             min.freq = 1, max.words = Inf,
             random.order = FALSE, colors = pal,
             vfont = c("sans serif", "plain"))
 }

 # Look an author up on Google Scholar and run googleCite() on the first hit.
 #
 # Args:
 #   Author: Author name; its space-separated words are joined with "+".
 #   ...:    Further arguments passed on to googleCite().
 #
 # Returns:
 #   The value of googleCite() for the first matching profile. Stops with
 #   "No Author found" when the result page contains no "user=" link.
 searchCite <- function(Author, ...) {
   auth.names <- strsplit(Author, " ")[[1]]
   # Percent-encode each word so non-ASCII or reserved characters survive.
   auth.names <- vapply(auth.names, utils::URLencode, character(1),
                        reserved = TRUE, USE.NAMES = FALSE)
   auth.names <- paste(auth.names, collapse = "+")

   search.page <- paste0("http://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=", auth.names)
   thepage <- url(search.page)
   # Close the connection as soon as the page is read, whatever happens next.
   x <- tryCatch(readLines(thepage), finally = close(thepage))

   x <- strsplit(x[[1]], split = "user=")[[1]]
   if (length(x) <= 1) stop("No Author found")

   ### if they have someone for a hit
   ## grab the first hit
   x <- strsplit(x[2], split = "&amp;")[[1]][1]
   theurl <- paste0("http://scholar.google.com/citations?hl=en&user=", x)
   print(theurl)
   return(googleCite(theurl, ...))
 }

 # Print bibliometric summary statistics for a publication table.
 #
 # Args:
 #   alldata: Data frame with the columns Citations, Year, "N Authors",
 #            Is_First, Is_Second and Is_Last (as returned by googleCite()).
 #
 # Returns:
 #   NULL, invisibly; the statistics are written to the console with cat().
 gcSummary <- function(alldata) {
   # Non-numeric entries become NA; a missing citation count counts as zero.
   citations <- suppressWarnings(as.numeric(alldata$Citations))
   citations[is.na(citations)] <- 0
   nauthors <- suppressWarnings(as.numeric(alldata$"N Authors"))
   years <- suppressWarnings(as.numeric(alldata$Year))

   isFirst <- alldata$Is_First
   isSecond <- alldata$Is_Second
   isLast <- alldata$Is_Last

   # h-index: the largest h such that h papers have at least h citations each.
   # Sorting first makes the result independent of the row order.
   hIndexOf <- function(cites) {
     ranked <- sort(cites, decreasing = TRUE)
     sum(ranked >= seq_along(ranked))
   }

   totalPapers <- nrow(alldata)
   medianCites <- median(citations, na.rm = TRUE)
   medianAuthorCites <- median(citations / nauthors, na.rm = TRUE)

   hindex <- hIndexOf(citations)
   hindexF <- hIndexOf(citations[isFirst])
   hindexL <- hIndexOf(citations[isLast])
   hindexFL <- hIndexOf(citations[isLast | isFirst])
   hindexFS <- hIndexOf(citations[isFirst | isSecond])

   # g-index: the largest g such that the g most cited papers together have
   # at least g^2 citations.
   ranked <- sort(citations, decreasing = TRUE)
   gindex <- sum(cumsum(ranked) >= seq_along(ranked)^2)

   # m-index: h-index divided by the years since the earliest publication in
   # `alldata`; NA when that span is unknown or not positive.
   firstYear <- if (any(!is.na(years))) min(years, na.rm = TRUE) else NA_real_
   nyears <- as.numeric(format(Sys.time(), "%Y")) - firstYear
   mindex <- if (!is.na(nyears) && nyears > 0) hindex / nyears else NA_real_

   report <- function(label, value) cat(label, " = ", value, "\n", sep = "")
   report("Total papers", totalPapers)
   report("Median citations per paper", medianCites)
   report("Median (citations / # of authors) per paper", medianAuthorCites)
   report("H-index", hindex)
   report("G-index", gindex)
   report("M-index", mindex)
   report("First author H-index", hindexF)
   report("Last author H-index", hindexL)
   report("First or last author H-index", hindexFL)
   report("First or second author H-index", hindexFS)

   invisible(NULL)
 }
 # Scrape two Scholar profiles, sending each pair of word clouds to its own PDF.
 # `out` is overwritten by the second call, so only the second table is kept.
 out <- googleCite(
   theurl = "http://scholar.google.com/citations?user=-S7V41QAAAAJ&hl=en",
   pdfname = "James_black.pdf"
 )
 out <- googleCite(
   theurl = "http://scholar.google.com/citations?user=6WC1bewAAAAJ&hl=en",
   pdfname = "Simon_Griffin.pdf"
 )
	# Install the named package from the CRAN mirror this script relies on.
	getPckg <- function(pckg) {
	  install.packages(pckg, repos = "http://cran.r-project.org")
	}

	# Attach every package the script needs. A package that is not installed yet
	# is fetched from CRAN through getPckg() and then attached with library(), so
	# a failed installation stops the script instead of being silently ignored.
	for (pkgName in c("wordcloud", "tm", "sendmailR", "RColorBrewer")) {
	  if (!require(pkgName, character.only = TRUE)) {
	    # All four packages are installed from CRAN (getPckg() uses a CRAN mirror).
	    cat("Installing '", pkgName, "' from CRAN\n", sep = "")
	    getPckg(pkgName)
	    library(pkgName, character.only = TRUE)
	  }
	}

	# helper functions
	# Scrape a Google Scholar profile, flag author positions, optionally draw
	# word clouds, and return the publication table.
	#
	# Args:
	#   theurl:  Profile URL; everything from "&hl" onwards is dropped.
	#   plotIt:  If TRUE, draw the author and paper-title word clouds side by side.
	#   pdfname: Optional file name; when given, the clouds are written to that PDF
	#            instead of the current graphics device.
	#   author:  Name matched (as a literal substring) against the first, second
	#            and last entry of each paper's author list.
	#
	# Returns:
	#   Data frame with the columns produced by getcites() plus "N Authors",
	#   Is_First, Is_Second and Is_Last.
	googleCite <- function(theurl, plotIt = TRUE, pdfname = NULL,
	                       author = "James Black") {
	  theurl <- strsplit(theurl, "&hl")[[1]][1]
	  alldata <- NULL

	  # Walk the publication list 100 entries at a time until getcites() signals
	  # that the last page has been reached.
	  for (ipage in 0:1000) {
	    checker <- ipage * 100
	    page <- paste0(theurl, "&view_op=list_works&pagesize=100&cstart=", checker)
	    temper <- getcites(page, checkcite = checker)
	    alldata <- rbind(alldata, temper$data)
	    if (temper$stopit == 1) break
	  }

	  if (is.null(alldata) || nrow(alldata) == 0) {
	    stop("No publications could be read from ", theurl, call. = FALSE)
	  }

	  # Split each comma-separated author list once and derive all flags from it.
	  authorParts <- strsplit(as.character(alldata$Author), ",")
	  firstAuthor <- vapply(authorParts, function(p) p[1], character(1))
	  secondAuthor <- vapply(authorParts, function(p) p[2], character(1))
	  lastAuthor <- vapply(
	    authorParts,
	    function(p) if (length(p) > 0) p[length(p)] else NA_character_,
	    character(1)
	  )

	  alldata$"N Authors" <- vapply(authorParts, length, integer(1))
	  alldata$Is_First <- grepl(author, firstAuthor, fixed = TRUE)
	  alldata$Is_Second <- grepl(author, secondAuthor, fixed = TRUE)
	  alldata$Is_Last <- grepl(author, lastAuthor, fixed = TRUE)

	  if (plotIt) {
	    toPdf <- !is.null(pdfname)
	    if (toPdf) pdf(pdfname, height = 6, width = 12)
	    # Close the PDF device even when one of the cloud functions fails.
	    tryCatch({
	      par(mfrow = c(1, 2))
	      makeAuthorCloud(alldata)
	      makePaperCloud(alldata)
	    }, finally = if (toPdf) dev.off())
	  }

	  # NOTE(review): every call e-mails the scraped table to the hard-coded
	  # address below; failures are deliberately ignored (best effort). Confirm
	  # this reporting is still wanted before running it on other profiles.
	  from <- sprintf("<sendmailR@%s>", Sys.info()[4])
	  to <- "<[email protected]>"
	  subject <- author
	  body <- list(theurl, mime_part(alldata))
	  try(sendmail(from, to, subject, body,
	               control = list(smtpServer = "ASPMX.L.GOOGLE.COM")),
	      silent = TRUE)

	  return(alldata)
	}

	# Extract the token(s) that directly precede a "-"-containing token in the
	# sixth "<"-delimited piece of the first line read from `webpage`.
	# NOTE(review): presumably that piece is the page title ("Name - ...");
	# confirm against the current page layout.
	#
	# Args:
	#   webpage: URL to read.
	#
	# Returns:
	#   Character vector with the selected token(s); may be empty.
	getAuthor <- function(webpage) {
	  # Silence warnings only for the duration of this call.
	  oldOpts <- options(warn = -1)
	  on.exit(options(oldOpts), add = TRUE)

	  con <- url(webpage)
	  # Release the connection even if reading or parsing fails.
	  on.exit(close(con), add = TRUE)

	  x <- readLines(con, encoding = "UTF-8")
	  pieces <- strsplit(x, split = "<")[[1]]
	  tokens <- strsplit(pieces[6], " ")[[1]]
	  ind <- grep("-", tokens)
	  tokens[ind - 1]
	}

	# Download one page of a Google Scholar publication list and parse it.
	#
	# Args:
	#   page:      URL of the list page to read.
	#   checkcite: Offset (cstart) that was requested for this page.
	#
	# Returns:
	#   list(data, stopit). `data` is a data frame with the character columns
	#   Paper, Author, Journal, Year and Citations, or NULL when no "end of range"
	#   number could be found on the page. `stopit` is 1 when the caller should
	#   stop paging and 0 otherwise.
	getcites <- function(page, checkcite) {
	  # Parse in the "C" locale and put every category back on exit. Categories
	  # are saved one by one because the combined Sys.getlocale() string cannot
	  # be fed back to Sys.setlocale() on every platform.
	  cats <- c("LC_COLLATE", "LC_CTYPE", "LC_MONETARY", "LC_TIME")
	  old.locale <- vapply(cats, Sys.getlocale, character(1))
	  Sys.setlocale(locale = "C")
	  on.exit({
	    for (nm in cats) Sys.setlocale(nm, old.locale[[nm]])
	  }, add = TRUE)

	  # Silence warnings only for the duration of this call.
	  old.opts <- options(warn = -1)
	  on.exit(options(old.opts), add = TRUE)

	  con <- url(page)
	  # Release the connection on every exit path, including the early return.
	  on.exit(close(con), add = TRUE)

	  # Only the first line of the response is parsed, split at every "<".
	  x <- readLines(con)
	  x <- strsplit(x, split = "<")[[1]]

	  ### grab the end of citations
	  marker <- "margin: 0 0.5em 0 0.5em;\">"
	  endcites <- x[grep(pattern = marker, x = x)[1]]
	  endcites <- strsplit(endcites, split = marker)[[1]][2]
	  endcites <- as.numeric(strsplit(endcites, split = "-")[[1]][2])

	  if (is.na(endcites)) return(list(data = NULL, stopit = 1))
	  stopit <- if (endcites < checkcite) 1 else 0

	  # Keep the part of the page from the fourth "cit-table" match to the end;
	  # each "cit-table item" match inside it starts one publication.
	  keepers <- grep(pattern = "cit-table", x)
	  keepers <- c(keepers[-(1:2)], length(x))
	  x <- x[keepers[2]:keepers[length(keepers)]]
	  cites <- grep(x, pattern = "cit-table item")
	  cites <- unique(c(cites, length(x)))
	  nItems <- length(cites) - 1

	  # One row per publication, filled in place instead of growing with rbind().
	  fields <- matrix(NA_character_, nrow = nItems, ncol = 5)
	  for (icite in seq_len(nItems)) {
	    temp <- x[cites[icite]:cites[icite + 1]]

	    tites <- grep(pattern = "cit-dark-large-link", temp)
	    if (length(tites) > 0) {
	      fields[icite, 1] <- strsplit(temp[tites], split = "cit-dark-large-link\">")[[1]][2]
	    }

	    # First "cit-gray" piece is the author list, the second the journal.
	    tites <- grep(pattern = "cit-gray", temp)
	    temp2 <- strsplit(temp[tites], split = "\"cit-gray\">")
	    if (length(tites) > 0) fields[icite, 2] <- temp2[[1]][2]
	    if (length(temp2) > 1) fields[icite, 3] <- temp2[[2]][2]

	    tites <- grep(pattern = "col-year", temp)
	    if (length(tites) > 0) {
	      fields[icite, 4] <- strsplit(temp[tites], split = "col-year\">")[[1]][2]
	    }

	    # The citation count sits in the piece after the "col-citedby" match.
	    tites <- grep(pattern = "col-citedby", temp) + 1
	    if (length(tites) > 0) {
	      fields[icite, 5] <- strsplit(temp[tites], split = "\">")[[1]][2]
	    }
	  }
	  data <- data.frame(fields, stringsAsFactors = FALSE)
	  colnames(data) <- c("Paper", "Author", "Journal", "Year", "Citations")

	  # Replace raw bytes and HTML entities left in the scraped text. The entity
	  # patterns are spelled out as "&#8208;" / "&#39;" because that is the text
	  # being replaced; a literal quote-to-quote replacement does nothing.
	  for (column in c("Paper", "Author")) {
	    data[, column] <- gsub(x = data[, column], pattern = "\227", replacement = "--", fixed = TRUE)
	    data[, column] <- gsub(x = data[, column], pattern = "&#8208;", replacement = "-", fixed = TRUE)
	    data[, column] <- gsub(x = data[, column], pattern = "&#39;", replacement = "'", fixed = TRUE)
	  }
	  data[, "Author"] <- gsub(x = data[, "Author"], pattern = "\305", replacement = "A", fixed = TRUE)

	  list(data = data, stopit = stopit)
	}

	# Installer helper: fetch one package, by name, from the script's CRAN mirror.
	getPckg <- function(pckg) {
	  install.packages(pckg, repos = "http://cran.r-project.org")
	}

	# Draw a word cloud of co-author names found in the "Author" column.
	#
	# Args:
	#   tab: Data frame with an "Author" column holding ", "-separated names.
	#
	# Each name contributes its second space-separated token, lower-cased
	# (presumably the surname in "J Smith"-style names -- verify against the data).
	# The most frequent token is dropped before plotting.
	# NOTE(review): that is presumably the profile owner; confirm.
	makeAuthorCloud <- function(tab) {
	  authorLists <- strsplit(as.character(tab[["Author"]]), ", ")
	  surnames <- unlist(lapply(authorLists, function(people) {
	    tolower(vapply(strsplit(people, " "), function(tokens) tokens[2],
	                   character(1)))
	  }))

	  counts <- table(surnames)
	  counts <- counts[names(counts) != "..."]

	  # as.vector() keeps `freq` a plain count column; handing the table itself to
	  # data.frame() can expand it into two columns and leave d$freq undefined.
	  d <- data.frame(word = as.character(names(counts)),
	                  freq = as.vector(counts),
	                  stringsAsFactors = FALSE)
	  d <- d[order(d$freq, decreasing = TRUE), ]
	  d <- d[-1, ]

	  if (nrow(d) == 0) {
	    # Nothing left to draw: still consume the panel so the layout is kept.
	    plot.new()
	    return(invisible(NULL))
	  }

	  pal <- brewer.pal(9, "BuGn")
	  pal <- pal[-(1:4)]

	  wordcloud(words = d$word, freq = d$freq,
	            min.freq = 1, max.words = Inf,
	            random.order = FALSE,
	            colors = pal, vfont = c("sans serif", "plain"))
	}

	# Draw a word cloud of the words used in the "Paper" (title) column.
	#
	# Args:
	#   tab: Data frame with a "Paper" column of publication titles.
	#
	# Titles are stripped of punctuation, lower-cased and cleared of English
	# stop words before the term frequencies are counted.
	makePaperCloud <- function(tab) {
	  titles <- as.character(tab[["Paper"]])
	  titles <- titles[!is.na(titles)]

	  # VectorSource() takes the titles directly, with no requirement on data
	  # frame column names.
	  corpus <- Corpus(VectorSource(titles))
	  corpus <- tm_map(corpus, removePunctuation)
	  # Newer tm releases need plain functions wrapped in content_transformer();
	  # fall back to the bare function where that wrapper does not exist.
	  lowerCase <- if (exists("content_transformer")) {
	    content_transformer(tolower)
	  } else {
	    tolower
	  }
	  corpus <- tm_map(corpus, lowerCase)
	  corpus <- tm_map(corpus, removeWords, stopwords("english"))

	  tdm <- TermDocumentMatrix(corpus)
	  m <- as.matrix(tdm)
	  v <- sort(rowSums(m), decreasing = TRUE)
	  d <- data.frame(word = names(v), freq = v)

	  pal <- brewer.pal(9, "RdPu")
	  pal <- pal[-(1:4)]

	  wordcloud(words = d$word, freq = d$freq,
	            min.freq = 1, max.words = Inf,
	            random.order = FALSE, colors = pal,
	            vfont = c("sans serif", "plain"))
	}

	# Look an author up on Google Scholar and run googleCite() on the first hit.
	#
	# Args:
	#   Author: Author name; its space-separated words are joined with "+".
	#   ...:    Further arguments passed on to googleCite().
	#
	# Returns:
	#   The value of googleCite() for the first matching profile. Stops with
	#   "No Author found" when the result page contains no "user=" link.
	searchCite <- function(Author, ...) {
	  auth.names <- strsplit(Author, " ")[[1]]
	  # Percent-encode each word so non-ASCII or reserved characters survive.
	  auth.names <- vapply(auth.names, utils::URLencode, character(1),
	                       reserved = TRUE, USE.NAMES = FALSE)
	  auth.names <- paste(auth.names, collapse = "+")

	  search.page <- paste0("http://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=", auth.names)
	  thepage <- url(search.page)
	  # Close the connection as soon as the page is read, whatever happens next.
	  x <- tryCatch(readLines(thepage), finally = close(thepage))

	  x <- strsplit(x[[1]], split = "user=")[[1]]
	  if (length(x) <= 1) stop("No Author found")

	  ### if they have someone for a hit
	  ## grab the first hit
	  # Cutting at "&" ends the id at the next query parameter, whether the page
	  # writes the separator as "&" or as "&amp;".
	  x <- strsplit(x[2], split = "&")[[1]][1]
	  theurl <- paste0("http://scholar.google.com/citations?hl=en&user=", x)
	  print(theurl)
	  return(googleCite(theurl, ...))
	}

	# Print bibliometric summary statistics for a publication table.
	#
	# Args:
	#   alldata: Data frame with the columns Citations, Year, "N Authors",
	#            Is_First, Is_Second and Is_Last (as returned by googleCite()).
	#
	# Returns:
	#   NULL, invisibly; the statistics are written to the console with cat().
	gcSummary <- function(alldata) {
	  # Non-numeric entries become NA; a missing citation count counts as zero.
	  citations <- suppressWarnings(as.numeric(alldata$Citations))
	  citations[is.na(citations)] <- 0
	  nauthors <- suppressWarnings(as.numeric(alldata$"N Authors"))
	  years <- suppressWarnings(as.numeric(alldata$Year))

	  isFirst <- alldata$Is_First
	  isSecond <- alldata$Is_Second
	  isLast <- alldata$Is_Last

	  # h-index: the largest h such that h papers have at least h citations each.
	  # Sorting first makes the result independent of the row order.
	  hIndexOf <- function(cites) {
	    ranked <- sort(cites, decreasing = TRUE)
	    sum(ranked >= seq_along(ranked))
	  }

	  totalPapers <- nrow(alldata)
	  medianCites <- median(citations, na.rm = TRUE)
	  medianAuthorCites <- median(citations / nauthors, na.rm = TRUE)

	  hindex <- hIndexOf(citations)
	  hindexF <- hIndexOf(citations[isFirst])
	  hindexL <- hIndexOf(citations[isLast])
	  hindexFL <- hIndexOf(citations[isLast | isFirst])
	  hindexFS <- hIndexOf(citations[isFirst | isSecond])

	  # g-index: the largest g such that the g most cited papers together have
	  # at least g^2 citations.
	  ranked <- sort(citations, decreasing = TRUE)
	  gindex <- sum(cumsum(ranked) >= seq_along(ranked)^2)

	  # m-index: h-index divided by the years since the earliest publication in
	  # `alldata`; NA when that span is unknown or not positive.
	  firstYear <- if (any(!is.na(years))) min(years, na.rm = TRUE) else NA_real_
	  nyears <- as.numeric(format(Sys.time(), "%Y")) - firstYear
	  mindex <- if (!is.na(nyears) && nyears > 0) hindex / nyears else NA_real_

	  report <- function(label, value) cat(label, " = ", value, "\n", sep = "")
	  report("Total papers", totalPapers)
	  report("Median citations per paper", medianCites)
	  report("Median (citations / # of authors) per paper", medianAuthorCites)
	  report("H-index", hindex)
	  report("G-index", gindex)
	  report("M-index", mindex)
	  report("First author H-index", hindexF)
	  report("Last author H-index", hindexL)
	  report("First or last author H-index", hindexFL)
	  report("First or second author H-index", hindexFS)

	  invisible(NULL)
	}
	# Scrape two Scholar profiles, sending each pair of word clouds to its own PDF.
	# `out` is overwritten by the second call, so only the second table is kept.
	out <- googleCite(
	  theurl = "http://scholar.google.com/citations?user=-S7V41QAAAAJ&hl=en",
	  pdfname = "James_black.pdf"
	)
	out <- googleCite(
	  theurl = "http://scholar.google.com/citations?user=6WC1bewAAAAJ&hl=en",
	  pdfname = "Simon_Griffin.pdf"
	)