#########################################################################################
# Some functions to quantify your Google Scholar citations page.
# R functions Copyright (C) 2011 John Muschelli ([email protected]), Andrew Jaffe ([email protected]),
# Jeffrey Leek ([email protected]), and the Simply Statistics Blog
# (http://simplystatistics.tumblr.com, http://twitter.com/simplystats)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details: <http://www.gnu.org/licenses/>.
#
#
# These functions depend on the packages wordcloud, tm, sendmailR, and RColorBrewer.
# Any that are missing will be installed when you source this file.
#
#
# How to use:
# # Source the functions
# source("http://biostat.jhsph.edu/~jleek/code/googleCite.r")
#
# # Get the URL of a scholar's citations page (this is the one for Rafa Irizarry:
# # http://scholar.google.com/citations?user=nFW-2Q8AAAAJ&hl=en) and run the
# # googleCite function. Setting plotIt=TRUE plots word clouds of the co-author
# # names and the paper titles; a PDF file is produced automatically, and you can
# # set its name/location with pdfname="yourname_wordcloud.pdf". When you run this
# # function, your Google Scholar data will be sent to our email account, so that
# # we can see who is running the function and perform population-level analyses.
# # The returned object is a table of the data from your Google Scholar citations page.
#
# out <- googleCite("http://scholar.google.com/citations?user=nFW-2Q8AAAAJ&hl=en", pdfname="rafa_cloud.pdf")
#
#
# # To calculate some popular citation indices, apply gcSummary to the output
# gcSummary(out)
#
#
# # You can also search for a specific individual by name using the function searchCite
#
# out2 <- searchCite("Rafa Irizarry", pdfname="rafa_cloud.pdf")
#
########################################################################################
getPckg <- function(pckg) install.packages(pckg, repos = "http://cran.r-project.org")

pckg = try(require(wordcloud))
if(!pckg) {
  cat("Installing 'wordcloud' from CRAN\n")
  getPckg("wordcloud")
  require(wordcloud)
}
pckg = try(require(tm))
if(!pckg) {
  cat("Installing 'tm' from CRAN\n")
  getPckg("tm")
  require("tm")
}
pckg = try(require(sendmailR))
if(!pckg) {
  cat("Installing 'sendmailR' from CRAN\n")
  getPckg("sendmailR")
  require("sendmailR")
}
pckg = try(require(RColorBrewer))
if(!pckg) {
  cat("Installing 'RColorBrewer' from CRAN\n")
  getPckg("RColorBrewer")
  require("RColorBrewer")
}
# Main function: scrape a Google Scholar citations page (100 papers at a time),
# tag first/second/last authorship, and optionally plot word clouds
googleCite = function(theurl, plotIt = TRUE, pdfname = NULL) {
  theurl = strsplit(theurl, "&hl")[[1]][1]
  alldata <- NULL
  author = getAuthor(paste(theurl, "&view_op=list_works&pagesize=100&cstart=", 0, sep=""))
  # page through the citation list, 100 entries per request
  for (ipage in 0:1000){
    checker <- ipage * 100
    page = paste(theurl, "&view_op=list_works&pagesize=100&cstart=", checker, sep="")
    temper <- getcites(page, checkcite=checker)
    alldata <- rbind(alldata, temper$data)
    if (temper$stopit == 1) break
  }
  # split each author string to record authorship position
  alldata$"First Author" <- NA
  alldata$"Second Author" <- NA
  alldata$"Last Author" <- NA
  alldata$"N Authors" <- NA
  for(irow in 1:nrow(alldata)){
    tmp = strsplit(alldata$Author[irow], ",")[[1]]
    alldata$"First Author"[irow] <- tmp[1]
    alldata$"Second Author"[irow] <- tmp[2]
    alldata$"Last Author"[irow] <- tmp[length(tmp)]
    alldata$"N Authors"[irow] <- length(tmp)
  }
  alldata$Is_First <- grepl(alldata$"First Author", pattern=author)
  alldata$Is_Second <- grepl(alldata$"Second Author", pattern=author)
  alldata$Is_Last <- grepl(alldata$"Last Author", pattern=author)
  alldata$"First Author" <- NULL
  alldata$"Second Author" <- NULL
  alldata$"Last Author" <- NULL
  if(plotIt) {
    if (!is.null(pdfname)) pdf(pdfname, h = 6, w = 12)
    par(mfrow = c(1, 2))
    makeAuthorCloud(alldata)
    makePaperCloud(alldata)
    if (!is.null(pdfname)) dev.off()
  }
  # email the scraped table to the authors (see the notice in the header above)
  from <- sprintf("<sendmailR@%s>", Sys.info()[4])
  to <- "<[email protected]>"
  subject <- author
  body <- list(theurl, mime_part(alldata))
  tmpEmail = try(email <- sendmail(from, to, subject, body, control=list(smtpServer="ASPMX.L.GOOGLE.COM")), silent=TRUE)
  return(alldata)
}
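
# A quick look at the output (column names as constructed above):
#   out <- googleCite("http://scholar.google.com/citations?user=nFW-2Q8AAAAJ&hl=en")
#   head(out)  # Paper, Author, Journal, Year, Citations, N Authors, Is_First, Is_Second, Is_Last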
# Pull the author's name (the first two words of the page title) from a citations page
getAuthor <- function(webpage) {
  options(warn = -1)
  con <- url(webpage)
  x <- readLines(con, encoding="UTF-8")
  y <- strsplit(x, split="<")
  z <- y[[1]]
  out <- paste(strsplit(strsplit(z[5], "title>")[[1]][2], " ")[[1]][1:2], collapse=" ")
  close(con)
  return(out)
}
# Scrape one page of citations; returns list(data=..., stopit=...) where
# stopit == 1 flags the final page
getcites <- function(page, checkcite){
  old.locale <- Sys.getlocale()
  Sys.setlocale(locale="C")
  options(warn = -1)
  con <- url(page)
  x <- readLines(con)
  x <- strsplit(x, split="<")
  x <- x[[1]]
  ### grab the index of the last citation shown on this page
  endcites <- x[grep(pattern="margin: 0 0.5em 0 0.5em;\">", x=x)[1]]
  endcites <- strsplit(endcites, split="margin: 0 0.5em 0 0.5em;\">")[[1]][2]
  endcites <- as.numeric(strsplit(endcites, split="-")[[1]][2])
  stopit <- 0
  if (is.na(endcites)) return(list(data=NULL, stopit=1))
  if (endcites < checkcite) stopit <- 1
  # keep only the block of HTML containing the citation table
  keepers <- grep(pattern="cit-table", x)
  keepers <- keepers[-(1:2)]
  keepers <- c(keepers, length(x))
  x <- x[keepers[2]:keepers[length(keepers)]]
  cites <- grep(x, pattern="cit-table item")
  cites <- unique(c(cites, length(x)))
  data <- NULL
  # parse each citation entry: title, authors, journal, year, citation count
  for(icite in 1:(length(cites) - 1)){
    temp_data <- data.frame(matrix(nrow=1, ncol=5))
    temp <- x[ cites[icite]:cites[icite+1] ]
    tites <- grep(pattern="cit-dark-large-link", temp)
    if (length(tites) > 0) temp_data[1, 1] <- strsplit(temp[tites], split="cit-dark-large-link\">")[[1]][2]
    tites <- grep(pattern="cit-gray", temp)
    temp2 <- strsplit(temp[tites], split="\"cit-gray\">")
    if (length(tites) > 0) temp_data[1, 2] <- temp2[[1]][2]
    if (length(temp2) > 1) temp_data[1, 3] <- temp2[[2]][2]
    tites <- grep(pattern="col-year", temp)
    if (length(tites) > 0) temp_data[1, 4] <- strsplit(temp[tites], split="col-year\">")[[1]][2]
    tites <- grep(pattern="col-citedby", temp) + 1
    if (length(tites) > 0) temp_data[1, 5] <- strsplit(temp[tites], split="\">")[[1]][2]
    data <- rbind(data, temp_data)
  }
  colnames(data) <- c("Paper", "Author", "Journal", "Year", "Citations")
data[, "Paper"] <- gsub(x=data[, "Paper"], pattern="\227", replacement="--", fixed=TRUE) | |
data[, "Paper"] <- gsub(x=data[, "Paper"], pattern="‐", replacement="-", fixed=TRUE) | |
data[, "Paper"] <- gsub(x=data[, "Paper"], pattern="'", replacement="'", fixed=TRUE) | |
data[, "Author"] <- gsub(x=data[, "Author"], pattern="\227", replacement="--", fixed=TRUE) | |
data[, "Author"] <- gsub(x=data[, "Author"], pattern="‐", replacement="-", fixed=TRUE) | |
data[, "Author"] <- gsub(x=data[, "Author"], pattern="'", replacement="'", fixed=TRUE) | |
data[, "Author"] <- gsub(x=data[, "Author"], pattern="\305", replacement="A", fixed=TRUE) | |
  close(con)
  Sys.setlocale(locale=old.locale)
  return(list(data=data, stopit=stopit))
}
# Word cloud of co-author last names; the most frequent name (usually the
# scholar themselves) is dropped
makeAuthorCloud = function(tab) {
  colIndex = which(names(tab) == "Author")
  tmp = strsplit(as.character(tab[, colIndex]), ", ")
  out = sapply(tmp, function(x) {
    x = strsplit(x, " ")
    x = sapply(x, function(x) x[2])   # keep the last name
    x = tolower(x)
    return(x)})
  out = unlist(out)
  tmp2 = table(out)
  tmp2 = tmp2[!(names(tmp2) == "...")]
  d = data.frame(word = names(tmp2), freq = as.numeric(tmp2), row.names = NULL)
  d = d[order(d$freq, decreasing = TRUE), ]
  d = d[-1, ]   # drop the most frequent name
  pal = brewer.pal(9, "BuGn")
  pal <- pal[-(1:4)]
  wordcloud(words = d$word, freq = d$freq,
            min.freq = 1, max.words = Inf,
            random.order = FALSE,
            colors = pal, vfont = c("sans serif", "plain"))
}
# Word cloud of the words in paper titles
makePaperCloud = function(tab) {
  colIndex = which(names(tab) == "Paper")
  corpus <- Corpus(DataframeSource(data.frame(tab[, colIndex])))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, tolower)
  corpus <- tm_map(corpus, function(x) removeWords(x, stopwords("english")))
  tdm <- TermDocumentMatrix(corpus)
  m <- as.matrix(tdm)
  v <- sort(rowSums(m), decreasing=TRUE)
  d <- data.frame(word = names(v), freq = v)
  pal = brewer.pal(9, "RdPu")
  pal <- pal[-(1:4)]
  wordcloud(words = d$word, freq = d$freq,
            min.freq = 1, max.words = Inf,
            random.order = FALSE, colors = pal, vfont = c("sans serif", "plain"))
}
# Look up a scholar by name and run googleCite on the first profile returned
searchCite <- function(Author, ...){
  auth.names <- strsplit(Author, " ")[[1]]
  auth.names <- paste(auth.names, sep="", collapse="+")
  search.page <- paste("http://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=", auth.names, sep="")
  thepage <- url(search.page)
  x <- readLines(thepage)
  close(thepage)
  x <- strsplit(x[[1]], split="user=")[[1]]
  if (length(x) > 1){
    ### grab the first hit, if there is one
    x <- x[2]
    x <- strsplit(x, split="&")[[1]][1]
    theurl <- paste("http://scholar.google.com/citations?hl=en&user=", x, sep="")
    print(theurl)
    return(googleCite(theurl, ...))
  } else stop("No Author found")
}
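
# Note: searchCite uses only the first profile returned by the name search and
# prints the URL it settled on, so for common names it is worth checking that
# the printed profile belongs to the person you intended.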
# Compute summary citation indices from googleCite output. The index
# calculations sort the citation counts in decreasing order, which is also the
# default ordering of a Google Scholar page.
gcSummary <- function(alldata){
  citations = as.numeric(alldata$Citations)
  citations[is.na(citations)] = 0
  nauthors = as.numeric(alldata$"N Authors")
  n = dim(alldata)[1]
  nF = sum(alldata$Is_First)
  nL = sum(alldata$Is_Last)
  nFL = sum(alldata$Is_Last | alldata$Is_First)
  nFS = sum(alldata$Is_First | alldata$Is_Second)
  totalPapers = dim(alldata)[1]
  totalCites = sum(citations, na.rm=TRUE)
  medianCites = median(citations, na.rm=TRUE)
  medianAuthorCites = median(citations/nauthors, na.rm=TRUE)
  # h-index: the largest h such that h papers have at least h citations each
  hindex = sum(sort(citations, decreasing=TRUE) >= 1:n, na.rm=TRUE)
  hindexF = sum(sort(citations[alldata$Is_First], decreasing=TRUE) >= 1:nF, na.rm=TRUE)
  hindexL = sum(sort(citations[alldata$Is_Last], decreasing=TRUE) >= 1:nL, na.rm=TRUE)
  hindexFL = sum(sort(citations[alldata$Is_Last | alldata$Is_First], decreasing=TRUE) >= 1:nFL, na.rm=TRUE)
  hindexFS = sum(sort(citations[alldata$Is_First | alldata$Is_Second], decreasing=TRUE) >= 1:nFS, na.rm=TRUE)
  # g-index: the largest g such that the top g papers have at least g^2 citations in total
  tmp = cumsum(sort(citations, decreasing=TRUE))
  gindex = sum(tmp >= (1:n)^2)
  # m-index: h-index divided by years since first publication
  nyears = as.numeric(format(Sys.time(), "%Y")) - min(as.numeric(alldata$Year), na.rm=TRUE)
  mindex = hindex/nyears
  cat("Total papers =", totalPapers, "\n")
  cat("Median citations per paper =", medianCites, "\n")
  cat("Median (citations / # of authors) per paper =", medianAuthorCites, "\n")
  cat("H-index =", hindex, "\n")
  cat("G-index =", gindex, "\n")
  cat("M-index =", mindex, "\n")
  cat("First author H-index =", hindexF, "\n")
  cat("Last author H-index =", hindexL, "\n")
  cat("First or last author H-index =", hindexFL, "\n")
  cat("First or second author H-index =", hindexFS, "\n")
}
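
# Worked example of the index definitions above, using made-up citation counts
# (not from any real Scholar page):
#   cites <- c(10, 6, 5, 3, 1)                  # already sorted decreasing
#   sum(cites >= seq_along(cites))              # h-index = 3: three papers with >= 3 citations
#   sum(cumsum(cites) >= seq_along(cites)^2)    # g-index = 5: cumsums 10,16,21,24,25 vs 1,4,9,16,25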