Created
May 28, 2012 16:09
-
-
Save Dakuan/2819896 to your computer and use it in GitHub Desktop.
Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// the scraper module | |
// you need to set it's gubbins with manipulate and get methods | |
var Scraper = function(){ | |
// public | |
setGubbins = function(gubb){ | |
gubbins = gubb; | |
}, | |
getContent = function(url, callback){ | |
console.log('loading page...') | |
var subPage = buildPage(); | |
subPage.open(url, function(status){ | |
if (status !== 'success') { | |
var fs = require('fs'); | |
fs.write(root + 'errors/' + createId() + '.err', url, 'w'); | |
console.log(status + ': Unable to load ' + url); | |
phantom.exit(); | |
return; | |
} | |
// pump in jquery | |
subPage.injectJs('jquery', function() {}); | |
gubbins.manipulate(subPage); | |
// delay the fetch to ensure the AJAX calls are all done after DOM manipulation | |
setTimeout(function(){ | |
gubbins.get(subPage, onComplete); | |
}, 200); | |
}); | |
}, | |
onComplete = function(dress){ | |
//var json = dress.title;//toJson(dress) | |
var json = toJson(dress); | |
console.log(json); | |
var fs = require('fs'); | |
fs.write(root + '/content/' + dress.id + '.json', json, 'w'); | |
quit(); | |
}, | |
// private | |
root = 'asos/dresses/', | |
count = 0, | |
quit = function(){ | |
console.log('exiting phantomjs'); | |
phantom.exit(); | |
}, | |
buildPage = function(){ | |
var page = require('webpage').create(); | |
page.onConsoleMessage = this.onConsoleMessage; | |
page.onError = this.onJsError; | |
return page; | |
}, | |
onJsError = function(msg, line, source){ | |
console.log('error> ' + msg + ' on line ' + line); | |
}, | |
gubbins = {}, | |
toJson = function serialize(obj){ | |
var returnVal; | |
if(obj != undefined){ | |
switch(obj.constructor) | |
{ | |
case Array: | |
var vArr="["; | |
for(var i=0;i<obj.length;i++) | |
{ | |
if(i>0) vArr += ","; | |
vArr += serialize(obj[i]); | |
} | |
vArr += "]" | |
return vArr; | |
case String: | |
returnVal = escape("'" + obj + "'"); | |
return returnVal; | |
case Number: | |
returnVal = isFinite(obj) ? obj.toString() : null; | |
return returnVal; | |
case Date: | |
returnVal = "#" + obj + "#"; | |
return returnVal; | |
default: | |
if(typeof obj == "object"){ | |
var vobj=[]; | |
for(attr in obj) | |
{ | |
if(typeof obj[attr] != "function") | |
{ | |
vobj.push('"' + attr + '":' + serialize(obj[attr])); | |
} | |
} | |
if(vobj.length >0) | |
return "{" + vobj.join(",") + "}"; | |
else | |
return "{}"; | |
} | |
else | |
{ | |
return obj.toString(); | |
} | |
} | |
} | |
return null; | |
}, | |
createId = function (){ | |
var id = 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) { | |
var r = Math.random()*16|0, v = c == 'x' ? r : (r&0x3|0x8); | |
return v.toString(16); | |
}); | |
return id; | |
}, | |
onConsoleMessage = function (msg, line, source) { | |
console.log('console> ' + msg); | |
}; | |
return{ | |
quit: quit, | |
setGubbins: setGubbins, | |
getContent: getContent | |
}; | |
}(); | |
// Gubbins | |
////////////////////// | |
/**
 * Gubbins (site-specific strategy) for ASOS dress product pages.
 *
 * Both methods hand a function to subPage.evaluate. Those functions use `$`
 * and `window` and deliberately reference nothing from the enclosing scope,
 * because they are meant to run inside the scraped page (with jQuery
 * injected), not in this script.
 */
var asosDress = {
    /**
     * Performs the AJAX-triggering interactions: clicks the "more" link and,
     * when the colour drop-down is enabled and has a second option, selects
     * that option and fires its change event.
     * @param {Object} subPage page object exposing evaluate(fn)
     */
    manipulate: function(subPage){
        // do all ajaxy things here
        console.log('manipulating page...');
        subPage.evaluate(function(){
            $('#ssMoreLink').click();
            // trigger the drop down
            var colourDrop = $('#ctl00_ContentMainPage_ctlSeparateProduct_drpdwnColour');
            if (colourDrop.attr('disabled') === 'disabled'){
                return;
            }
            var options = colourDrop.children();
            // Without a second option there is nothing to select; indexing it
            // would throw inside the page.
            if (options.length < 2){
                return;
            }
            colourDrop.val(options[1].text);
            colourDrop.change();
        });
    },

    /**
     * Collects the product data and passes it to onComplete with
     * retailerId added.
     * @param {Object} subPage page object exposing evaluate(fn)
     * @param {Function} [onComplete] called with the scraped record
     * @throws {Error} when the page evaluation yields no record
     */
    get: function(subPage, onComplete){
        // collect the data
        console.log('scraping page...');
        var dress = subPage.evaluate(function(){
            // Texts of every option except the first one (the original code
            // dropped index 0 too; presumably it is the placeholder entry).
            var textsAfterFirst = function(options){
                var texts = [];
                for (var k = 1; k < options.length; k++){
                    texts.push(options[k].text);
                }
                return texts;
            };

            var colourOptions = $('#ctl00_ContentMainPage_ctlSeparateProduct_drpdwnColour').children();
            console.log(colourOptions.length);
            var sizeOptions = $('#ctl00_ContentMainPage_ctlSeparateProduct_drpdwnSize').children();

            var images = $('img', '.productImagesItems');
            var imageUrls = [];
            for (var m = 0; m < images.length; m++){
                imageUrls.push($(images[m]).attr('src'));
            }

            return {
                id: $('#ctl00_ContentMainPage_ctlSeparateProduct_hdnSku').val().trim(),
                title: $('#ctl00_ContentMainPage_ctlSeparateProduct_lblProductTitle').text(),
                price: $('#ctl00_ContentMainPage_ctlSeparateProduct_lblProductPrice').text(),
                description: $('.single-entry').text(),
                sizes: textsAfterFirst(sizeOptions),
                colours: textsAfterFirst(colourOptions),
                images: imageUrls,
                url: window.location.href
            };
        });
        // Fail with a clear message instead of a TypeError on the next line
        // when the in-page function produced nothing.
        if (!dress){
            throw new Error('asosDress.get: page evaluation returned no data');
        }
        dress.retailerId = this.retailerId;
        if (onComplete){
            onComplete(dress);
        }
    },

    // Identifier stamped onto every scraped record.
    retailerId: 12345
};
// Execution | |
////////////////////// | |
// Entry point: wire the ASOS gubbins into the scraper and scrape the URL
// given as the first command-line argument (system.args[0] is the script name).
try {
    Scraper.setGubbins(asosDress);
    var system = require('system');
    var targetUrl = system.args[1];
    if (!targetUrl) {
        // Without a URL there is nothing to open; exit instead of loading `undefined`.
        console.log('usage: phantomjs <script> <url>');
        Scraper.quit();
    } else {
        Scraper.getContent(targetUrl);
    }
}
catch (ex) {
    // Only synchronous start-up errors can reach this handler.
    console.log('phantomjs ex: ' + ex);
    Scraper.quit();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment