Last active
September 25, 2020 05:10
-
-
Save flovv/63e79a3149729b57d0397bb22a589856 to your computer and use it in GitHub Desktop.
scrapeGoogleImages_file1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var url = 'https://www.google.de/search?q=Yahoo+logo&source=lnms&tbm=isch&sa=X';
// NOTE: the assignment above MUST remain the first line of this file --
// the companion R function scrapeJSSite() rewrites lines[1] in place to
// inject the search URL before each PhantomJS run.
var page = new WebPage();
var fs = require('fs');

// Viewport tall enough to make Google lazy-load a first batch of images.
var vWidth = 1080;
var vHeight = 1920;
page.viewportSize = {
    width: vWidth,
    height: vHeight
};

// Scroll state. The page has not been opened yet, so sampling
// document.body.scrollHeight here would measure about:blank; start at 0
// and let sc() read the real height on each tick after page.open().
var s = 0;
var sBase = 0;
page.scrollPosition = {
    top: sBase,
    left: 0
};
// Incrementally scroll the page so lazily-loaded images get fetched.
// Re-samples document.body.scrollHeight on every tick, because the page
// grows as new results stream in; once the scroll offset passes the
// bottom, the original viewport is restored and the loop stops.
function sc() {
    var measured = page.evaluate(function () { return document.body.scrollHeight; });
    if (measured !== sBase) {
        sBase = measured;
    }
    if (s > sBase) {
        // Done scrolling: restore the full viewport for the final render.
        page.viewportSize = { width: vWidth, height: vHeight };
        return;
    }
    page.scrollPosition = { top: s, left: 0 };
    page.viewportSize = { width: vWidth, height: s };
    // Advance by 5% of the page height, capped at 400px per tick.
    // The Math.max guard guarantees forward progress even while sBase
    // is still 0 (page not yet measured) -- without it the original
    // code stepped by 0 and spun in a no-progress timer loop.
    s += Math.max(Math.min(sBase / 20, 400), 50);
    setTimeout(sc, 110);
}
// Give the scroll loop a fixed 2.5 s head start, then persist the
// rendered DOM to 1.html and terminate PhantomJS. Note this is a plain
// race with sc() -- the delay is not synchronized with scrolling.
function just_wait() {
    var saveAndExit = function () {
        fs.write('1.html', page.content, 'w');
        phantom.exit();
    };
    setTimeout(saveAndExit, 2500);
}
// Entry point: open the search URL, then start the scroll loop and arm
// the save-and-exit timer; both run concurrently off the event loop.
page.open(url, function (status) {
    sc();
    just_wait();
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(plyr) | |
library(reshape2) | |
require(rvest) | |
## Run the PhantomJS scraper for one search term and return the image
## URLs found on the rendered Google Images results page.
##
## Args:
##   searchTerm: query string already in URL form (spaces as '+', e.g.
##               "Adidas+logo") -- it is pasted into the URL verbatim.
## Returns:
##   data.frame with columns `images` (img src attributes, character)
##   and `search` (the search term).
scrapeJSSite <- function(searchTerm){
  url <- paste0("https://www.google.de/search?q=", searchTerm, "&source=lnms&tbm=isch&sa=X")

  ## imageScrape.js expects its first line to be the url assignment;
  ## rewrite that line in place so PhantomJS loads the requested query.
  lines <- readLines("imageScrape.js")
  lines[1] <- paste0("var url ='", url, "';")
  writeLines(lines, "imageScrape.js")

  ## Render the JS-heavy page; on success PhantomJS writes 1.html.
  ## The original ignored the exit status and would silently parse a
  ## stale 1.html from a previous run if phantomjs failed.
  status <- system("phantomjs imageScrape.js")
  if (status != 0) {
    stop("phantomjs exited with status ", status)
  }

  pg <- read_html("1.html")
  files <- pg %>% html_nodes("img") %>% html_attr("src")
  ## stringsAsFactors = FALSE keeps `images` as character, so callers
  ## no longer need the as.character() workaround (still harmless).
  df <- data.frame(images = files, search = searchTerm, stringsAsFactors = FALSE)
  return(df)
}
## Download each image URL into outPath as "<brand>_<i>.jpg".
##
## Args:
##   files:   character vector of image URLs (may be empty).
##   brand:   label used in the output file names.
##   outPath: destination directory, created if missing. Default "images".
downloadImages <- function(files, brand, outPath = "images"){
  ## Ensure the target directory exists before writing into it.
  if (!dir.exists(outPath)) {
    dir.create(outPath, recursive = TRUE)
  }
  ## seq_along() is empty-safe; the original 1:length(files) iterates
  ## c(1, 0) when `files` is empty and would call download.file anyway.
  for (i in seq_along(files)) {
    download.file(files[i],
                  destfile = paste0(outPath, "/", brand, "_", i, ".jpg"),
                  mode = 'wb')
  }
}
### exchange the search terms here!
gg <- scrapeJSSite(searchTerm = "Adidas+logo")
## BUG FIX: the second argument is the brand label used in the output
## file names; the original passed a stray (undefined) variable `i`.
downloadImages(as.character(gg$images), "Adidas")
Author
flovv
commented
Dec 29, 2019
via email
Have you seen that Link: http://flovv.github.io/scrape_images_google/
I assume that you have to set the scroll parameter `var s = 0;` such that
phantom scrolls through a couple of pages.
Note, this might increase page load time.
…On Thu, Dec 26, 2019 at 9:31 AM ArindamRouth ***@***.***> wrote:
How to Download more than 20 images? Please help
—
You are receiving this because you authored the thread.
Reply to this email directly, view it on GitHub
<https://gist.github.com/63e79a3149729b57d0397bb22a589856?email_source=notifications&email_token=AASCD4TTT5HXJYXNLGPXCM3Q2RTU5A5CNFSM4J7JN2M2YY3PNVWWK3TUL52HS4DFVNDWS43UINXW23LFNZ2KUY3PNVWWK3TUL5UWJTQAF6PK6#gistcomment-3120815>,
or unsubscribe
<https://github.com/notifications/unsubscribe-auth/AASCD4TVPGZONEXMNH5KYJTQ2RTU5ANCNFSM4J7JN2MQ>
.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment