Skip to content

Instantly share code, notes, and snippets.

@dmarcelinobr
Last active August 29, 2015 14:13
Show Gist options
  • Save dmarcelinobr/a7b6fdc6044b25640f0e to your computer and use it in GitHub Desktop.
Save dmarcelinobr/a7b6fdc6044b25640f0e to your computer and use it in GitHub Desktop.
require(dplyr)
require(RCurl)
require(reshape2)
require(rvest)
require(pipeR)
#' Scrape Google search result links for a search term
#'
#' Builds Google search URLs for `search.term`, downloads each result page
#' once, extracts the anchors matched by the CSS selector ".r a", and returns
#' one row per link whose target contains "http".
#'
#' @param search.term A single search string. Reserved URL characters
#'   (`&`, `/`, `#`, `%`, ...) are percent-encoded; spaces become `+`.
#' @param show Maximum number of results to return (a single number >= 1).
#'   `ceiling(show / 10) + 1` pages are requested, i.e. one more page than
#'   strictly needed, as in the original implementation.
#' @return A data.frame with columns `url`, `title`, `site`, `url_name`,
#'   `google_url`, `search_term` and `search_time`, holding at most `show`
#'   rows. Fewer rows (possibly zero) are returned when fewer links are found;
#'   no all-`NA` padding rows are added.
getGoogleResults <- function (search.term, show = 10){
  stopifnot(
    length(search.term) == 1L,
    is.numeric(show), length(show) == 1L, !is.na(show), show >= 1
  )

  # Attach the packages the body relies on (kept from the original so the
  # function still works when sourced on its own).
  # NOTE(review): `htmlParse` is not exported by any package attached here;
  # presumably XML::htmlParse is expected on the search path -- confirm.
  packages <- c('rvest','dplyr','pipeR','reshape2')
  invisible(lapply(packages, library, character.only = TRUE))

  trim <- function(x) gsub("^\\s+|\\s+$", "", x)

  # Zero-row result with the same columns as a successful call.
  empty_result <- function() {
    data.frame(
      url = character(0), title = character(0), site = character(0),
      url_name = character(0), google_url = character(0),
      search_term = character(0), search_time = Sys.time()[0],
      stringsAsFactors = FALSE
    )
  }

  # Percent-encode the whole term in one pass. The original escaped only "&"
  # and "/", and the "/" branch restarted from the raw term, discarding the
  # "&" escaping. `repeated = TRUE` forces literal "%" to be encoded as well.
  query <- utils::URLencode(as.character(search.term),
                            reserved = TRUE, repeated = TRUE)
  query <- gsub("%20", "+", query, fixed = TRUE)
  url.name <- paste0("https://www.google.com/search?q=", query)

  # First page has no "start" parameter; following pages use start = 10, 20, ...
  n_pages <- ceiling(show / 10)
  search_url <- c(url.name, paste0(url.name, "&start=", seq_len(n_pages) * 10))

  # Download and parse one result page; NULL when it yields no usable link.
  scrape_page <- function(page_url) {
    # ".r a" presumably matched Google's result-title anchors when this was
    # written -- verify against the current markup.
    links <- html_nodes(htmlParse(getURL(page_url)), ".r a")
    if (length(links) == 0L) {
      return(NULL)
    }
    # Read href and text from the same node set so both vectors stay aligned
    # (one entry per anchor), instead of flattening every attribute.
    hrefs <- html_attr(links, "href")
    hrefs[is.na(hrefs)] <- ""
    labels <- html_text(links)

    # Drop the first seven characters (presumably the "/url?q=" redirect
    # prefix), then cut everything from the first "&" onwards.
    targets <- substr(hrefs, 8L, nchar(hrefs))
    keep <- targets != ""
    if (!any(keep)) {
      return(NULL)
    }
    data.frame(
      url = sub("&.*$", "", targets[keep]),
      url_name = labels[keep],
      google_url = page_url,
      search_term = search.term,
      stringsAsFactors = FALSE
    )
  }

  pages <- Filter(Negate(is.null), lapply(search_url, scrape_page))
  if (length(pages) == 0L) {
    warning("No results found for search term: ", search.term, call. = FALSE)
    return(empty_result())
  }

  # Bind once instead of growing a data frame inside a loop.
  all_df <- do.call(rbind, pages)
  # Same filter as the original "http|https" pattern: any target containing "http".
  all_df <- all_df[grepl("http", all_df$url, fixed = TRUE), , drop = FALSE]
  # head() truncates without padding NA rows when fewer than `show` remain.
  all_df <- head(all_df, show)
  if (nrow(all_df) == 0L) {
    warning("No results found for search term: ", search.term, call. = FALSE)
    return(empty_result())
  }

  # Split the link text at the first "|" or " - " into title and site.
  parts <- colsplit(string = all_df$url_name, pattern = '\\||\\ - ',
                    names = c('title','site'))

  data.frame(
    url = all_df$url,
    title = trim(as.character(parts$title)),
    site = trim(as.character(parts$site)),
    url_name = all_df$url_name,
    google_url = all_df$google_url,
    search_term = all_df$search_term,
    search_time = Sys.time(),
    stringsAsFactors = FALSE
  )
}
# Example run: request up to 74 results for "Petrobras" and keep them in `data`.
data <- getGoogleResults(search.term = "Petrobras", show = 74)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment