Last active
December 27, 2015 04:09
-
-
Save epijim/7264519 to your computer and use it in GitHub Desktop.
Scrape Google Scholar. You need to edit the links at the bottom so that they point to the person's Google Scholar page.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Install one or more packages from CRAN.
#   pckg - character vector of package names.
getPckg <- function(pckg) {
  install.packages(pckg, repos = "http://cran.r-project.org")
}

# Attach every package this script relies on, installing any that are missing.
# All four packages are installed from CRAN through getPckg().
for (pkg_name in c("wordcloud", "tm", "sendmailR", "RColorBrewer")) {
  if (!requireNamespace(pkg_name, quietly = TRUE)) {
    cat("Installing '", pkg_name, "' from CRAN\n", sep = "")
    getPckg(pkg_name)
  }
  # library() raises an error when the package is still unavailable, so a
  # failed install is reported here rather than as a missing function later.
  library(pkg_name, character.only = TRUE)
}
| # helper functions | |
# Scrape every publication listed on a Google Scholar profile.
#
# Arguments:
#   theurl  - profile URL; everything from "&hl" onwards is stripped.
#   plotIt  - if TRUE, draw the co-author and paper-title word clouds.
#   pdfname - optional file name; when given, the clouds are written to that
#             PDF (6 x 12 inches) instead of the current graphics device.
#   author  - name, used as a regular expression, that is matched against the
#             first, second and last author of every paper. It is also used as
#             the subject of the notification e-mail.
#
# Returns a data frame with one row per paper: the columns delivered by
# getcites() plus "N Authors", Is_First, Is_Second and Is_Last.
googleCite <- function(theurl, plotIt = TRUE, pdfname = NULL,
                       author = "James Black") {
  theurl <- strsplit(theurl, "&hl")[[1]][1]

  # Request 100 entries per page until getcites() signals the last page.
  pages <- list()
  for (ipage in 0:1000) {
    checker <- ipage * 100
    page <- paste(theurl, "&view_op=list_works&pagesize=100&cstart=",
                  checker, sep = "")
    temper <- getcites(page, checkcite = checker)
    # Assigning NULL (a page without data) leaves the list unchanged.
    pages[[length(pages) + 1]] <- temper$data
    if (temper$stopit == 1) break
  }
  alldata <- do.call(rbind, pages)
  if (is.null(alldata) || nrow(alldata) == 0) {
    stop("No publications could be read from ", theurl, call. = FALSE)
  }

  # Split the comma-separated author string of every paper once and derive
  # the author count and the first / second / last positions from it.
  author_parts <- strsplit(as.character(alldata$Author), ",")
  pick <- function(parts, position) {
    if (length(parts) >= position && position > 0) parts[position] else NA_character_
  }
  first_author <- vapply(author_parts, pick, character(1), position = 1)
  second_author <- vapply(author_parts, pick, character(1), position = 2)
  last_author <- vapply(author_parts, function(parts) pick(parts, length(parts)),
                        character(1))

  alldata$"N Authors" <- lengths(author_parts)
  alldata$Is_First <- grepl(first_author, pattern = author)
  alldata$Is_Second <- grepl(second_author, pattern = author)
  alldata$Is_Last <- grepl(last_author, pattern = author)

  if (plotIt) {
    if (!is.null(pdfname)) pdf(pdfname, height = 6, width = 12)
    par(mfrow = c(1, 2))
    tryCatch({
      makeAuthorCloud(alldata)
      makePaperCloud(alldata)
    }, finally = {
      # Close the PDF device even when one of the clouds fails to draw.
      if (!is.null(pdfname)) dev.off()
    })
  }

  # NOTE(review): the scraped table is e-mailed to a hard-coded recipient on
  # every call, and any failure is swallowed by try(). The recipient literal
  # below looks like a redacted placeholder - confirm the intended address, or
  # remove this step if the notification is not wanted.
  from <- sprintf("<sendmailR@%s>", Sys.info()[4])
  to <- "<[email protected]>"
  subject <- author
  body <- list(theurl, mime_part(alldata))
  try(sendmail(from, to, subject, body,
               control = list(smtpServer = "ASPMX.L.GOOGLE.COM")),
      silent = TRUE)

  return(alldata)
}
# Read a web page and pull a name fragment out of its first line.
#
# The first line is split on "<", the sixth fragment is split on spaces, and
# the word(s) immediately preceding every token that contains "-" are returned.
#
# Arguments:
#   webpage - URL of the page to read.
#
# Returns a character vector (empty when no token contains "-").
getAuthor <- function(webpage) {
  # Silence warnings only for the duration of this call.
  old_opts <- options(warn = -1)
  on.exit(options(old_opts), add = TRUE)

  con <- url(webpage)
  # Release the connection even if reading the page fails.
  on.exit(close(con), add = TRUE)

  x <- readLines(con, encoding = "UTF-8")
  fragments <- strsplit(x, split = "<")[[1]]
  words <- strsplit(fragments[6], " ")[[1]]
  words[grep("-", words) - 1]
}
# Download one page of a Google Scholar publication list and parse its entries.
#
# Arguments:
#   page      - URL of the list page to read.
#   checkcite - index of the first entry requested on this page.
#
# Returns list(data, stopit). `data` is a data frame with the columns Paper,
# Author, Journal, Year and Citations, or NULL when nothing could be parsed.
# `stopit` is 1 when no further pages should be requested and 0 otherwise.
getcites <- function(page, checkcite) {
  # The page is processed under the "C" locale, presumably so that the byte
  # escapes used in the clean-up below are matched as raw bytes. Each category
  # changed by Sys.setlocale(locale = "C") is saved and restored individually,
  # because the combined string from Sys.getlocale() is not guaranteed to be
  # accepted back by Sys.setlocale().
  locale.categories <- c("LC_COLLATE", "LC_CTYPE", "LC_MONETARY", "LC_TIME")
  old.locale <- vapply(locale.categories, Sys.getlocale, character(1))
  Sys.setlocale(locale = "C")
  on.exit({
    for (category in locale.categories) {
      Sys.setlocale(category, old.locale[[category]])
    }
  }, add = TRUE)

  # Silence warnings only for the duration of this call.
  old.opts <- options(warn = -1)
  on.exit(options(old.opts), add = TRUE)

  con <- url(page)
  # Release the connection on every exit path, including the early returns.
  on.exit(close(con), add = TRUE)

  # Only the first line of the response is parsed, split into "<" fragments.
  x <- readLines(con)
  x <- strsplit(x, split = "<")[[1]]

  ### grab the end of citations
  range.marker <- "margin: 0 0.5em 0 0.5em;\">"
  endcites <- x[grep(pattern = range.marker, x = x)[1]]
  endcites <- strsplit(endcites, split = range.marker)[[1]][2]
  endcites <- as.numeric(strsplit(endcites, split = "-")[[1]][2])
  if (is.na(endcites)) return(list(data = NULL, stopit = 1))
  stopit <- if (endcites < checkcite) 1 else 0

  # Skip the first two "cit-table" fragments and keep everything from the
  # following one up to the end of the page.
  keepers <- grep(pattern = "cit-table", x)
  keepers <- keepers[-1]
  keepers <- keepers[-1]
  keepers <- c(keepers, length(x))
  if (length(keepers) < 2) return(list(data = NULL, stopit = 1))
  x <- x[keepers[2]:keepers[length(keepers)]]

  # Each entry runs from one "cit-table item" fragment to the next.
  cites <- grep(x, pattern = "cit-table item")
  cites <- unique(c(cites, length(x)))
  n.entries <- length(cites) - 1
  if (n.entries < 1) return(list(data = NULL, stopit = 1))

  rows <- vector("list", n.entries)
  for (icite in seq_len(n.entries)) {
    temp_data <- data.frame(matrix(nrow = 1, ncol = 5))
    temp <- x[cites[icite]:cites[icite + 1]]

    # Column 1: paper title.
    tites <- grep(pattern = "cit-dark-large-link", temp)
    if (length(tites) > 0) temp_data[1, 1] <- strsplit(temp[tites], split = "cit-dark-large-link\">")[[1]][2]

    # Columns 2 and 3: the first and second "cit-gray" fragments
    # (authors and journal).
    tites <- grep(pattern = "cit-gray", temp)
    temp2 <- strsplit(temp[tites], split = "\"cit-gray\">")
    if (length(tites) > 0) temp_data[1, 2] <- temp2[[1]][2]
    if (length(temp2) > 1) temp_data[1, 3] <- temp2[[2]][2]

    # Column 4: year.
    tites <- grep(pattern = "col-year", temp)
    if (length(tites) > 0) temp_data[1, 4] <- strsplit(temp[tites], split = "col-year\">")[[1]][2]

    # Column 5: citation count, taken from the fragment after "col-citedby".
    tites <- grep(pattern = "col-citedby", temp) + 1
    if (length(tites) > 0) temp_data[1, 5] <- strsplit(temp[tites], split = "\">")[[1]][2]

    rows[[icite]] <- temp_data
  }
  data <- do.call(rbind, rows)
  colnames(data) <- c("Paper", "Author", "Journal", "Year", "Citations")

  # Normalise a few special characters in titles and author lists.
  # NOTE(review): the third replacement maps "'" to "'" and is a no-op as
  # written; its pattern (and possibly the hyphen pattern before it) was
  # presumably an HTML entity before this copy was rendered - confirm.
  for (field in c("Paper", "Author")) {
    data[, field] <- gsub(x = data[, field], pattern = "\227", replacement = "--", fixed = TRUE)
    data[, field] <- gsub(x = data[, field], pattern = "‐", replacement = "-", fixed = TRUE)
    data[, field] <- gsub(x = data[, field], pattern = "'", replacement = "'", fixed = TRUE)
  }
  data[, "Author"] <- gsub(x = data[, "Author"], pattern = "\305", replacement = "A", fixed = TRUE)

  return(list(data = data, stopit = stopit))
}
# Install the package(s) named in `pckg` from the CRAN repository.
getPckg <- function(pckg) {
  cran_repo <- "http://cran.r-project.org"
  install.packages(pckg, repos = cran_repo)
}
# Draw a word cloud of co-author surnames.
#
# Arguments:
#   tab - table with an "Author" column holding ", "-separated author names.
#
# The second space-separated word of every name is taken as the surname and
# lower-cased. The most frequent surname (presumably the profile owner) is
# left out of the cloud, as are "..." placeholders.
makeAuthorCloud <- function(tab) {
  colIndex <- which(names(tab) == "Author")
  author_lists <- strsplit(as.character(tab[, colIndex]), ", ")

  surnames <- unlist(lapply(author_lists, function(authors) {
    second_words <- vapply(strsplit(authors, " "), function(parts) parts[2],
                           character(1))
    tolower(second_words)
  }))

  counts <- table(surnames)
  counts <- counts[!(names(counts) == "...")]

  # as.vector() strips the table structure so that `freq` is one plain column;
  # passing the table itself can expand into two columns (label and count),
  # after which d$freq no longer identifies a single column.
  d <- data.frame(word = names(counts), freq = as.vector(counts),
                  row.names = NULL)
  d <- d[order(d$freq, decreasing = TRUE), ]
  d <- d[-1, ]

  # Use only the five darkest shades of the nine-colour palette.
  pal <- brewer.pal(9, "BuGn")
  pal <- pal[-(1:4)]
  wordcloud(words = d$word, freq = d$freq,
            min.freq = 1, max.words = Inf,
            random.order = FALSE,
            colors = pal, vfont = c("sans serif", "plain"))
}
# Draw a word cloud of the words used in paper titles.
#
# Arguments:
#   tab - table with a "Paper" column holding the paper titles.
#
# Titles are stripped of punctuation, lower-cased and cleared of English stop
# words before the term frequencies are counted.
makePaperCloud <- function(tab) {
  colIndex <- which(names(tab) == "Paper")
  titles <- as.character(tab[, colIndex])
  # Entries without a title contribute no words.
  titles <- titles[!is.na(titles)]

  # VectorSource takes the titles directly; DataframeSource expects a frame
  # with specific column names in current releases of tm.
  corpus <- VCorpus(VectorSource(titles))
  corpus <- tm_map(corpus, removePunctuation)
  # Plain functions such as tolower must be wrapped so that the documents
  # keep their corpus structure.
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, stopwords("english"))

  tdm <- TermDocumentMatrix(corpus)
  v <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  d <- data.frame(word = names(v), freq = v)

  # Use only the five darkest shades of the nine-colour palette.
  pal <- brewer.pal(9, "RdPu")
  pal <- pal[-(1:4)]
  wordcloud(words = d$word, freq = d$freq,
            min.freq = 1, max.words = Inf,
            random.order = FALSE, colors = pal,
            vfont = c("sans serif", "plain"))
}
# Look an author up on Google Scholar and scrape the first matching profile.
#
# Arguments:
#   Author - author name; its space-separated parts are joined with "+" to
#            form the search query.
#   ...    - further arguments passed on to googleCite().
#
# Returns the result of googleCite() for the first profile found and stops
# with "No Author found" when the search page lists no profile.
searchCite <- function(Author, ...) {
  auth.names <- strsplit(Author, " ")[[1]]
  # Percent-encode each part so that accents, apostrophes and similar
  # characters cannot corrupt the query string.
  auth.names <- vapply(auth.names, utils::URLencode, character(1),
                       reserved = TRUE, USE.NAMES = FALSE)
  query <- paste(auth.names, collapse = "+")
  search.page <- paste("http://scholar.google.com/citations?hl=en&view_op=search_authors&mauthors=", query, sep = "")

  thepage <- url(search.page)
  # Release the connection as soon as the page has been read (or has failed).
  x <- tryCatch(readLines(thepage), finally = close(thepage))

  # Only the first line of the response is searched for profile ids.
  x <- strsplit(x[[1]], split = "user=")[[1]]
  if (length(x) < 2) stop("No Author found")

  ## grab the first hit
  x <- strsplit(x[2], split = "&")[[1]][1]
  theurl <- paste("http://scholar.google.com/citations?hl=en&user=", x, sep = "")
  print(theurl)
  return(googleCite(theurl, ...))
}
# Print bibliometric summary statistics for a table produced by googleCite().
#
# Arguments:
#   alldata - data frame with the columns Citations, Year, "N Authors",
#             Is_First, Is_Second and Is_Last.
#
# Missing citation counts are treated as 0. The summary is printed with cat();
# the computed values are also returned invisibly as a named list.
gcSummary <- function(alldata) {
  # h-index: number of papers, ranked by descending citations, whose citation
  # count is at least as large as their rank.
  h_index <- function(cites) {
    ranked <- sort(cites, decreasing = TRUE)
    sum(ranked >= seq_along(ranked))
  }

  citations <- as.numeric(alldata$Citations)
  citations[is.na(citations)] <- 0
  nauthors <- as.numeric(alldata$"N Authors")

  totalPapers <- nrow(alldata)
  totalCites <- sum(citations, na.rm = TRUE)
  medianCites <- median(citations, na.rm = TRUE)
  medianAuthorCites <- median(citations / nauthors, na.rm = TRUE)

  hindex <- h_index(citations)
  hindexF <- h_index(citations[alldata$Is_First])
  hindexL <- h_index(citations[alldata$Is_Last])
  hindexFL <- h_index(citations[alldata$Is_Last | alldata$Is_First])
  hindexFS <- h_index(citations[alldata$Is_First | alldata$Is_Second])

  # g-index: number of top-ranked papers g whose cumulative citations reach g^2.
  ranked <- sort(citations, decreasing = TRUE)
  gindex <- sum(cumsum(ranked) >= seq_along(ranked)^2)

  # m-index: h-index divided by the years since the earliest publication year
  # found in the table that was passed in.
  nyears <- as.numeric(format(Sys.time(), "%Y")) -
    min(as.numeric(alldata$Year), na.rm = TRUE)
  mindex <- hindex / nyears

  report <- list(
    "Total papers = " = totalPapers,
    "Median citations per paper = " = medianCites,
    "Median (citations / # of authors) per paper = " = medianAuthorCites,
    "H-index = " = hindex,
    "G-index = " = gindex,
    "M-index = " = mindex,
    "First author H-index = " = hindexF,
    "Last author H-index = " = hindexL,
    "First or last author H-index = " = hindexFL,
    "First or second author H-index = " = hindexFS
  )
  for (label in names(report)) {
    cat(label, report[[label]], "\n", sep = "")
  }

  invisible(list(
    totalPapers = totalPapers,
    totalCites = totalCites,
    medianCites = medianCites,
    medianAuthorCites = medianAuthorCites,
    hindex = hindex,
    gindex = gindex,
    mindex = mindex,
    hindexF = hindexF,
    hindexL = hindexL,
    hindexFL = hindexFL,
    hindexFS = hindexFS
  ))
}
# Scrape two Google Scholar profiles, writing the word clouds of each to its
# own PDF. `out` is reassigned, so only the second table remains bound.
out <- googleCite("http://scholar.google.com/citations?user=-S7V41QAAAAJ&hl=en",
                  pdfname = "James_black.pdf")
out <- googleCite("http://scholar.google.com/citations?user=6WC1bewAAAAJ&hl=en",
                  pdfname = "Simon_Griffin.pdf")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment