Last active
July 14, 2017 13:45
-
-
Save crazy4groovy/0b196ae90bef0ba6074c to your computer and use it in GitHub Desktop.
Scrape Google images search results
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Bookmarklet: scrape image URLs from a Google Images results page.
 *
 * Clicks each `a.rg_l` element in turn (presumably the result thumbnails),
 * waits for the preview links (`div.irc_bg div.irc_rimask a`) to appear,
 * reads the `imgurl` query parameter from each of them and lists every
 * unique URL in a floating overlay. Double-click the overlay to dismiss it.
 *
 * NOTE(review): both selectors depend on Google's page markup and will need
 * updating whenever that markup changes.
 */
!function(document) {
  var DEFAULT_DELAY_MS = 250;

  // Ask before touching the page so that cancelling the prompt is a clean no-op.
  var answer = prompt('delay in ms', String(DEFAULT_DELAY_MS));
  if (answer === null) return;

  var delay = Number(answer);
  // NaN or a negative delay would make every timer fire at once; use the default.
  if (!isFinite(delay) || delay < 0) delay = DEFAULT_DELAY_MS;

  // Overlay that accumulates the scraped URLs.
  var d = document.createElement('div');
  d.style.cssText = 'position: fixed; top: 1em; left: 1em; z-index: 1000; background-color: rgba(188,188,88,.5); padding: 1em; border-radius: 1em; max-height: 50%; max-width: 50%; overflow: scroll;';
  d.ondblclick = d.remove;
  document.body.appendChild(d);

  // URLs already listed, used as a set (no prototype, so any string is a safe key).
  var seen = Object.create(null);

  /** Calls `cb` for every element matching `selector`. */
  function eachEl(selector, cb) {
    [].forEach.call(document.querySelectorAll(selector), cb);
  }

  /** Opens the preview for one result link, then harvests the preview links. */
  function eachBigImg(a) {
    a.click();
    // Give the preview time to render, but finish before the next click fires.
    setTimeout(function() {
      eachEl('div.irc_bg div.irc_rimask a', eachImg);
    }, delay / 1.5 | 0);
  }

  /** Extracts the `imgurl` parameter from one preview link and lists it once. */
  function eachImg(a) {
    // Match the raw href first: decoding the whole href up front would turn an
    // encoded '&' inside the image URL into a separator and truncate the value.
    var match = a.href.match(/imgurl=([^&]+)/);
    if (!match) return;

    var url;
    try {
      url = decodeURIComponent(match[1]);
    } catch (e) {
      // Malformed percent-escape: keep the raw value rather than abort the batch.
      url = match[1];
    }

    if (seen[url]) return;
    seen[url] = true;

    // Append as text, never as markup: the URL comes from the page itself.
    d.appendChild(document.createTextNode(url));
    d.appendChild(document.createElement('br'));
  }

  // Stagger the clicks so each preview can open before the next one replaces it.
  eachEl('a.rg_l', function(a, index) {
    setTimeout(function() {
      eachBigImg(a);
    }, index * delay);
  });
}(document);
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var fs = require('fs');
var path = require('path');

var co = require('co');
var readdir = require('recursive-readdir');
/* | |
Simple script that removes trailing "garbage" from detectable images' filenames | |
I.e. Google Image file downloads via `wget --no-check-certificate -nc -T 4 -t 4 -i images.txt` | |
*/ | |
var dir = process.argv[2]; | |
if (!dir) { | |
console.log('Please specify a directory to process - Aborting.'); | |
return; | |
} | |
var img1 = /^.*\.(jpe?g|png|gif)$/i; | |
var img2 = /\.(jpe?g|png|gif)(.*)/i; | |
/**
 * Renames an image file whose name carries trailing cruft after its
 * extension, e.g. `photo.jpg?w=100` becomes `photo._.jpg`. The `_` is kept
 * in the new name to make a clash with an existing clean name less likely.
 *
 * Nothing happens when the base name contains no image extension, when
 * nothing follows the extension, or when the target name already exists
 * (the file is left alone rather than overwriting another one).
 *
 * @param {string} fileName - Path of the file to rename.
 * @returns {void}
 */
function renameFile(fileName) {
  // Look at the base name only, so a directory such as "pics.gif/" or "pngs/"
  // can never be mistaken for the file's extension.
  const baseName = path.basename(fileName);
  const matched = baseName.match(img2);
  if (!matched) return;

  const type = matched[1];
  const cruft = matched[2];
  // Nothing trails the extension: the name is already clean.
  if (cruft === '') return;

  // `matched.index` is the position of the dot before the extension; slicing
  // there is exact, unlike searching the path for the cruft or type text.
  const stem = baseName.slice(0, matched.index);
  const goodName = path.join(path.dirname(fileName), `${stem}._.${type}`);

  if (fs.existsSync(goodName)) {
    // Two downloads can collapse to the same clean name; keep both files.
    console.log(fileName, ' --> skipped, target already exists: ', goodName);
    return;
  }

  fs.renameSync(fileName, goodName);
  console.log(fileName, ' --> ', goodName);
}
// Entry point: list every file under `dir` and rename those whose name does
// not already end in an image extension.
co(function* init() {
  const fileNames = yield readdir(dir);
  fileNames
    .filter(f => !img1.test(f))
    .forEach(f => renameFile(f));
}).catch(err => {
  // Without a handler, a failure in readdir or renameSync would surface only
  // as an unhandled rejection; report it and exit with a failing status.
  console.error(err);
  process.exitCode = 1;
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
javascript:(function(){;!function(document)%7Bvar%20d=document.createElement(%22div%22);d.style.cssText=%22position:%20fixed;%20top:%201em;%20left:%201em;%20z-index:%201000;%20background-color:%20rgba(188,188,88,.5);%20padding:%201em;%20border-radius:%201em;%20max-height:%2050%25;%20max-width:%2050%25;%20overflow:%20scroll;%22;d.ondblclick=d.remove;document.body.appendChild(d);var%20matches=%5B%5D;function%20eachEl(selector,cb)%7B%5B%5D.forEach.call(document.querySelectorAll(selector),cb)%7Dvar%20delay=+prompt(%22delay%20in%20ms%22,%22250%22);var%20i=0;eachEl(%22a.rg_l%22,function(a)%7BsetTimeout(eachBigImg.bind(this,a),i++*delay)%7D);function%20eachBigImg(a)%7Ba.click();setTimeout(eachEl.bind(this,%22div.irc_bg%20div.irc_rimask%20a%22,eachImg),delay/1.5%7C0)%7Dfunction%20eachImg(a)%7Bvar%20href=decodeURIComponent(a.href);var%20match=href.match(/imgurl=(%5B%5E%5C&%5D+)/);if(!match%7C%7C~matches.indexOf(match%5B1%5D))return;matches.push(match%5B1%5D);d.innerHTML+=match%5B1%5D+%22%3Cbr%3E%22%7D%7D(document);})()