Last active
October 27, 2015 21:04
-
-
Save timcdlucas/78478755b49e5c6342c4 to your computer and use it in GitHub Desktop.
Scrape pubmed or scholar
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
# Web scraping. | |
library(rvest) | |
# For synonym list | |
library(taxize) | |
# Scrape the number of PubMed results for a species name.
#
# The search term is the quoted species name or, when taxize returns a set of
# synonyms, the quoted synonyms joined with OR.
#
# Args:
#   sp: Species name as a single character string.
#
# Returns:
#   The result count as a numeric, or NA when PubMed reports that the term was
#   not found or when no count could be read from the page.
scrapePub <- function(sp){

  # Pause between requests.
  Sys.sleep(2)

  # Initialise refs so every failure path below returns NA.
  refs <- NA

  # Find synonyms from taxize
  syns <- synonyms(sp, db = 'itis')

  # A single-row result is treated as "no synonyms": search on sp alone.
  # NOTE(review): presumably this is how taxize signals no match -- confirm
  # against the taxize version in use.
  if (NROW(syns[[1]]) == 1) {
    spString <- tolower(gsub(' ', '%20', sp))
  } else {
    spString <- paste(tolower(gsub(' ', '%20', syns[[1]]$syn_name)), collapse = '%22+OR+%22')
  }

  url <- paste0('http://www.ncbi.nlm.nih.gov/pubmed/?term=%22', spString, '%22')

  # read_html() replaces the deprecated rvest::html().
  page <- read_html(url)

  # Test if exact phrase was found. The result is TRUE only when the page
  # carries the "not found" warning; depending on the rvest version a missing
  # '.icon' node yields either an error (caught by try) or FALSE.
  phraseMissing <- try(page %>%
    html_node('.icon') %>%
    html_text() %>%
    grepl("The following term was not found in PubMed:", .), silent = TRUE)

  # Read the count unless PubMed explicitly said the phrase was not found.
  # isTRUE() covers the try-error, FALSE and NA outcomes in one test, so a
  # page without the warning is scraped whichever way the probe failed.
  if (!isTRUE(phraseMissing)) {
    try({
      # The count is the last space-separated token of the '.result_count' text.
      refs <- page %>%
        html_node('.result_count') %>%
        html_text() %>%
        strsplit(' ') %>%
        .[[1]] %>%
        .[length(.)] %>%
        as.numeric()
    })
  }

  return(refs)
}
# Scrape the number of Google Scholar results for a species name.
#
# The search term is the quoted species name or, when taxize returns a set of
# synonyms, the quoted synonyms joined with OR.
#
# Args:
#   sp: Species name as a single character string.
#
# Returns:
#   The result count as a numeric, or NA when no count could be read from the
#   page.
scrapeScholar <- function(sp){

  # Randomised pause of roughly two minutes between requests
  # (presumably to avoid being blocked by Scholar).
  wait <- rnorm(1, 120, 2)
  Sys.sleep(wait)

  # Initialise refs so a failed scrape returns NA instead of raising
  # "object 'refs' not found" at return().
  refs <- NA

  # Find synonyms from taxize
  syns <- synonyms(sp, db = 'itis')

  # A single-row result is treated as "no synonyms": search on sp alone.
  # NOTE(review): presumably this is how taxize signals no match -- confirm
  # against the taxize version in use.
  if (NROW(syns[[1]]) == 1) {
    spString <- tolower(gsub(' ', '%20', sp))
  } else {
    spString <- paste(tolower(gsub(' ', '%20', syns[[1]]$syn_name)), collapse = '%22+OR+%22')
  }

  url <- paste0('https://scholar.google.co.uk/scholar?hl=en&q=%22',
    spString, '%22&btnG=&as_sdt=1%2C5&as_sdtp=')

  # read_html() replaces the deprecated rvest::html().
  page <- read_html(url)

  try({
    statsText <- page %>%
      html_node('#gs_ab_md') %>%
      html_text()

    # The text reads "About 1,230 results (...)" for large result sets but just
    # "7 results (...)" for small ones, so match the number directly before
    # "result(s)" rather than relying on the leading "About".
    countText <- regmatches(
      statsText,
      regexpr('[0-9][0-9,]*(?=\\s+results?)', statsText, perl = TRUE)
    )

    # No match (or a missing node) leaves refs as NA.
    if (length(countText) == 1) {
      refs <- as.numeric(gsub(',', '', countText))
    }
  })

  return(refs)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
for scrapeScholar:
The text of the '#gs_ab_md' node is not always the same. When there are fewer than ten results, the text does not say "About X results"; it only says "X results".
I changed the gsub for a strapply call from the gsubfn package
refs <- page %>%
html_node('#gs_ab_md') %>%
html_text() %>%
strapplyc("(\\w+) results ") %>%
as.numeric
Now I'll go try it out
cheers