-
-
Save smoll/da91baed46465056ccf0d315ef2734cd to your computer and use it in GitHub Desktop.
scrapeGoogleImages_file1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Target Google image-search URL and the PhantomJS page we render it in.
var url = 'https://www.google.de/search?q=Yahoo+logo&source=lnms&tbm=isch&sa=X';
var page = new WebPage();
var fs = require('fs');

// Viewport dimensions used while idle and restored once scrolling ends.
var vWidth = 1080;
var vHeight = 1920;
page.viewportSize = {
  width: vWidth,
  height: vHeight
};

// Scroll state shared with sc(): current offset and last observed height.
// NOTE(review): this evaluate() runs before page.open(), so sBase is
// measured on a blank page here; sc() refreshes it on every tick anyway.
var s = 0;
var sBase = page.evaluate(function () { return document.body.scrollHeight; });
page.scrollPosition = {
  top: sBase,
  left: 0
};
// Step the scroll position down the page until the (possibly growing)
// document height is exhausted, forcing lazily loaded images to render.
function sc() {
  // The document grows as images load in, so refresh the height each tick.
  // (Unconditional assignment is equivalent to the guarded update.)
  sBase = page.evaluate(function () { return document.body.scrollHeight; });

  // Past the bottom: restore the normal viewport and stop rescheduling.
  if (s > sBase) {
    page.viewportSize = {width: vWidth, height: vHeight};
    return;
  }

  // Scroll to the current offset and grow the viewport along with it so
  // everything above the offset has been "visible" at least once.
  page.scrollPosition = {top: s, left: 0};
  page.viewportSize = {width: vWidth, height: s};

  // Advance by 5% of the document height, capped at 400px, then reschedule.
  s += Math.min(sBase / 20, 400);
  setTimeout(sc, 110);
}
// Save the rendered page to 1.html and exit PhantomJS.
//
// BUG FIX: the original used a single fixed 2500ms timeout, which raced
// against sc()'s scroll loop — on long result pages the HTML was saved
// before most lazy-loaded images had rendered (hence only ~20 images).
// Instead, poll until the scroll pass has run past the document height
// (sc() stops when s > sBase), then allow a final grace period for any
// in-flight image requests before writing the file.
function just_wait() {
  var poll = setInterval(function () {
    if (s > sBase) {
      clearInterval(poll);
      setTimeout(function () {
        fs.write('1.html', page.content, 'w');
        phantom.exit();
      }, 2500);
    }
  }, 250);
}
// Load the search page; on success, start the scroll loop and arm the
// save-and-exit watcher in parallel.
//
// BUG FIX: the original ignored `status`, so a failed load still waited
// and wrote an (empty) 1.html. Exit with a non-zero code instead so the
// calling R script's system() invocation can notice the failure.
page.open(url, function (status) {
  if (status !== 'success') {
    console.error('Failed to load ' + url);
    phantom.exit(1);
    return;
  }
  sc();
  just_wait();
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(plyr) | |
library(reshape2) | |
require(rvest) | |
# Scrape Google image-search results for `searchTerm`.
#
# Rewrites the first line of imageScrape.js (the `var url = ...`
# assignment) to point at the search URL, renders the page with
# phantomjs (which saves it as 1.html), then extracts every <img> src.
#
# Returns a data.frame with character columns `images` (src URLs)
# and `search` (the search term).
scrapeJSSite <- function(searchTerm){
  url <- paste0("https://www.google.de/search?q=", searchTerm, "&source=lnms&tbm=isch&sa=X")

  # NOTE(review): assumes the url assignment is exactly line 1 of the
  # script — keep imageScrape.js in sync with this convention.
  lines <- readLines("imageScrape.js")
  lines[1] <- paste0("var url ='", url, "';")
  writeLines(lines, "imageScrape.js")

  ## Render the JS-driven page; phantomjs writes the result to 1.html
  system("phantomjs imageScrape.js")
  pg <- read_html("1.html")
  files <- pg %>% html_nodes("img") %>% html_attr("src")

  # BUG FIX: keep URLs as character — under R < 4.0 the default
  # stringsAsFactors = TRUE turned them into factors, forcing callers
  # to work around it with as.character().
  df <- data.frame(images = files, search = searchTerm, stringsAsFactors = FALSE)
  return(df)
}
# Download each image URL in `files` into `outPath`, naming the files
# <brand>_<i>.jpg in input order.
downloadImages <- function(files, brand, outPath="images"){
  # BUG FIX: create the target directory if missing — download.file()
  # fails when the destination directory does not exist.
  dir.create(outPath, showWarnings = FALSE, recursive = TRUE)
  # BUG FIX: seq_along() handles empty input; 1:length(files) would
  # yield c(1, 0) and attempt a bogus download when files is empty.
  for(i in seq_along(files)){
    download.file(files[i], destfile = paste0(outPath, "/", brand, "_", i, ".jpg"), mode = 'wb')
  }
}
### exchange the search terms here!
gg <- scrapeJSSite(searchTerm = "Adidas+logo")
# BUG FIX: the original passed `i` — an undefined (or leftover loop)
# variable — as the brand argument; pass an explicit brand label so the
# downloaded files get meaningful names.
downloadImages(as.character(gg$images), "Adidas")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hello, thank you for this nice post. I have a basic question: how do I modify the scrapeGoogleImages.js code to be able to go beyond 20 images?
Thank you for your help.