// Gist qcom/5969334, created July 10, 2013 19:20.
// Note from the hosting page: this file contains bidirectional Unicode text that may be
// interpreted or compiled differently than what appears below. To review, open the file
// in an editor that reveals hidden Unicode characters.
// File-system module; used further down through fs.list, fs.read and fs.write.
var fs = require('fs');

// The scraper instance: a stock casper object passed through the project's `config`
// and `extend` augmenters (their effect is not visible here; presumably they add
// casper.index, casper.login, casper.domain, casper.label and casper.goto — confirm).
// jQuery is listed as a client script because the in-page scrape functions rely on `$`.
var casper = require('./util/augment/extend')(require('./util/augment/config')(require('casper').create({
    clientScripts: ['includes/jquery-2.0.1.min.js'],
    pageSettings: {
        loadPlugins: false
    }
})));

// Manufacturer tree keyed by manufacturer name; the scrape step reads `url`, `more`,
// `products`, `pages`, `crumbs`, `fullCrumbs` and `path` from its nodes.
var mans = require('./mans');
// Manufacturer names in iteration order, and the position of the one being scraped.
var keys = Object.keys(mans);
var keyIndex = 0;
// Helper applied to a manufacturer node to obtain its expected product total (see casper.getNext).
var sumIn = require('./util/sumIn');
/**
 * Converts a scraped label such as "Mfr. Part No.:" into an object key ("mfrPartNo").
 *
 * Every colon and period is removed, the text is split on runs of whitespace, the
 * first word is lower-cased and the remaining words are appended unchanged.
 *
 * This function is handed to casper.evaluate as an argument by the scrape step, so it
 * must stay self-contained: no references to variables outside its own body.
 *
 * @param {string} s - Raw label text.
 * @returns {string} The cleaned key; '' when `s` is not a string or holds no words.
 */
casper.cleanKey = function(s) {
    // A missing label yields an empty key instead of a TypeError.
    if (typeof s !== 'string') {
        return '';
    }
    // Global pattern: a string pattern would only strip the first ':' and the first '.'.
    // Splitting on whitespace runs also keeps tabs/newlines from leaking into the key.
    var words = s.replace(/[:.]/g, '').trim().split(/\s+/);
    var result = '';
    for (var i = 0; i < words.length; i++) {
        result += (i === 0) ? words[i].toLowerCase() : words[i];
    }
    return result;
};
/**
 * Normalizes a scraped value by trimming surrounding whitespace.
 *
 * Like casper.cleanKey, this is handed to casper.evaluate as an argument and must
 * therefore stay self-contained.
 *
 * @param {*} s - Raw value text; may be missing when a cell had no value part.
 * @returns {string} The trimmed text, or '' for null/undefined.
 */
casper.cleanValue = function(s) {
    // A cell such as "Discontinued" (no "label: value" form) produces an undefined
    // value upstream; return an empty string rather than throwing on .trim().
    if (s === undefined || s === null) {
        return '';
    }
    return String(s).trim();
};
/**
 * Collects the preliminary data of every product listed on a category page.
 *
 * Passed to casper.evaluate by the scrape step, so it runs against the loaded page
 * and relies on the page-side jQuery `$`; it must not reference anything outside
 * its own parameters.
 *
 * @param {string} domain - Prefix prepended to the relative URLs found on the page.
 * @param {Function} cleanKey - Unused; kept so the call signature stays unchanged.
 * @param {Function} cleanValue - Unused; kept so the call signature stays unchanged.
 * @param {Object} obj - Category node; its crumbs, fullCrumbs and path are copied onto each product.
 * @param {number} pageIndex - Zero-based index of the page within the category.
 * @param {string} manKey - Manufacturer name stored as `manName`.
 * @returns {Array<Object>} One object per `.productListing` row, in page order.
 */
casper.scrapeCategory = function(domain, cleanKey, cleanValue, obj, pageIndex, manKey) {
    var products = [];
    $('.productListing').each(function(i) {
        var $main = $(this).find('.desc2 .descInner');
        var $man = $main.find('.manufacName');
        var $product = $main.find('.prodName a');
        var p = {};

        // The manufacturer link is optional and is stored as found (no domain prefix).
        var $manLink = $man.find('a');
        if ($manLink.length > 0) {
            p.manUrl = $manLink.attr('href');
        }
        // The manufacturer logo is optional too; without this check a missing image
        // would be recorded as domain + "undefined".
        var manImageSrc = $man.find('img').attr('src');
        if (manImageSrc) {
            p.manImageUrl = domain + manImageSrc;
        }

        // The product link is what the caller opens next.
        p.url = domain + $product.attr('href');
        p.name = $product.text();
        p.sku = $main.find('.skuID').text().replace('SKU:', '').trim();

        // obj properties
        p.crumbs = obj.crumbs;
        p.fullCrumbs = obj.fullCrumbs;
        p.path = obj.path;
        p.manName = manKey;

        // misc: one-based positions for human-readable output
        p.pageNumber = pageIndex + 1;
        p.numberOnPage = i + 1;

        products.push(p);
    });
    return products;
};
/**
 * Augments a product with the details found on its own product page.
 *
 * Passed to casper.evaluate by the scrape step, so it runs against the loaded page
 * and relies on the page-side jQuery `$`; it must not reference anything outside
 * its own parameters.
 *
 * @param {Object} p - Product collected from the category page; extended in place.
 * @param {Function} cleanKey - Turns a label into a property name.
 * @param {Function} cleanValue - Normalizes a value string.
 * @param {string} domain - Prefix prepended to the image URL.
 * @returns {Object} `p`, with imageUrl/imageAlt, skuPrefix, features and one
 *     property per specification row added where the page provides them.
 */
casper.scrapeProduct = function(p, cleanKey, cleanValue, domain) {
    // Stores one label/value pair on the product; rows without a usable label are
    // dropped instead of being written under an empty-string key.
    function addSpec(label, value) {
        var key = cleanKey(label);
        if (key !== '') {
            p[key] = cleanValue(value);
        }
    }

    var $img = $('.productDetailLeft td:first img');
    if ($img.length > 0) {
        p.imageUrl = domain + $img.attr('src');
        p.imageAlt = $img.attr('alt');
    }

    var $sku = $('.skuID');
    if ($sku.length > 0) {
        var skuTexts = $sku.text().trim().split(':');
        // -1 means "not found"; a truthiness test would wrongly reject index 0.
        var skuIndex = -1;
        for (var i = 0; i < skuTexts.length; i++) {
            if (skuTexts[i].toLowerCase().indexOf('prefix') !== -1) {
                // Deliberately not breaking: the last matching segment wins.
                skuIndex = i;
            }
        }
        if (skuIndex !== -1) {
            p.skuPrefix = skuTexts[skuIndex].toLowerCase().replace('prefix', '').trim();
        }
    }

    var $features = $('.product-features li');
    if ($features.length > 0) {
        p.features = [];
        $features.each(function() {
            p.features.push($(this).text());
        });
    }

    // Label/value table rows: the label cell carries class "label".
    $('.item-details-wrapper tr').each(function() {
        var $row = $(this);
        addSpec($row.find('td.label').text(), $row.find('td:not(.label)').text());
    });

    // "Label: value" cells of the pricing pod (cells holding inputs or links are excluded).
    $('.pricing-availability-pod td[valign="middle"]:not(:has(input)):not(:has(a))').each(function() {
        var text = $(this).text();
        var colon = text.indexOf(':');
        if (colon === -1) {
            // No "label: value" shape, so there is nothing to record.
            return;
        }
        // Split on the first colon only so values such as "10:30 AM" stay intact.
        addSpec(text.slice(0, colon), text.slice(colon + 1));
    });

    // Specification table, header rows excluded.
    $('.productdata-pod tr:not(".colheader")').each(function() {
        var $row = $(this);
        addSpec($row.find('.bd').text(), $row.find('.desc').text());
    });

    return p;
};
/**
 * Finds the first manufacturer whose scrape is missing or incomplete.
 *
 * A manufacturer counts as complete when ./output holds "<name>.json" and the
 * number of entries in that file equals sumIn(<manufacturer node>).
 *
 * NOTE(review): `regularRequire` is not defined in this file; it is presumably a
 * global that loads the JSON file and returns the parsed array — confirm.
 *
 * @returns {?string} The manufacturer key to scrape next, or null when every
 *     manufacturer has been fully scraped.
 */
casper.getNext = function() {
    // The listing is used as-is: dropping its first two entries to skip "." and ".."
    // could discard real data files whenever those entries are not listed first, and
    // they can never equal a "<name>.json" lookup anyway.
    var files = fs.list('./output');
    // read manufacturers from previous scrape
    var mans = JSON.parse(fs.read('/home/zach/nasco/products/mans.json'));
    var manKeys = Object.keys(mans);

    for (var i = 0; i < manKeys.length; i++) {
        var manKey = manKeys[i];
        var file = manKey + '.json';

        // no scraped data file at all: this manufacturer still has to be scraped
        if (files.indexOf(file) === -1) {
            return manKey;
        }

        // compare the number of scraped products with the number expected
        var man = regularRequire('/home/zach/nasco/products/output/' + file);
        if (man.length !== sumIn(mans[manKey])) {
            return manKey;
        }
    }

    // the scrape has completed
    return null;
};
/*** Start ***/
// casper.index, casper.login, casper.domain, casper.label and casper.goto are not defined
// in this file; presumably they come from the augment modules applied to `casper` — confirm.
casper.start(casper.index);

/*** Steps ***/
casper.then(casper.login);

// Loop entry point: the final step jumps back here once per remaining manufacturer.
casper.label('scrape');

casper.then(function() {
    console.log(keyIndex);
    var manKey = keys[keyIndex];
    // cache current man during loop over keys
    var man = mans[manKey];
    // point casper to man's url first: category and product urls can only be navigated
    // properly after visiting the manufacturer's url (otherwise a category page lists
    // every product of that category instead of only those of this manufacturer)
    this.thenOpen(man.url, function() {
        // log current manufacturer's name
        this.echo('\n' + manKey);
        // all of the products scraped for the current manufacturer
        var manProducts = [];
        // cache current value of this for use inside the nested callbacks
        var that = this;
        // recursive walk down to each branch's deepest category; the param is named
        // `obj` since `man` would be incorrect after the first recursion
        (function r(obj) {
            if (obj.more) {
                // another level exists: obj.products holds one key per next-level branch
                Object.keys(obj.products).forEach(function(key) {
                    r(obj.products[key]);
                });
                return;
            }
            // deepest category reached: open and scrape each of its pages;
            // pageIndex feeds the pageNumber property of each product
            that.each(obj.pages, function(self, page, pageIndex) {
                // first, category-level scrape
                that.thenOpen(page.url, function() {
                    // data found on the category page only; product pages not yet visited
                    var products = that.evaluate(that.scrapeCategory, that.domain, that.cleanKey, that.cleanValue, obj, pageIndex, manKey);
                    that.each(products, function(self, product) {
                        // second, product-level scrape
                        that.thenOpen(product.url, function() {
                            // augment `product` with features, detailed specifications, etc.
                            var detailed = that.evaluate(that.scrapeProduct, product, that.cleanKey, that.cleanValue, that.domain);
                            // Keep the category-level data if the in-page scrape yields nothing
                            // (assumes evaluate returns a falsy value when the page function
                            // fails — TODO confirm), so the sku log below cannot throw.
                            if (detailed) {
                                product = detailed;
                            }
                            manProducts.push(product);
                            // rewrite the manufacturer's data file after every product so an
                            // interrupted run keeps what was scraped so far
                            fs.write('./output/' + manKey + '.json', JSON.stringify(manProducts));
                            // log current product's sku to signal the end of current product scrape
                            that.echo(product.sku);
                        });
                    });
                    // after the products of this page, log the crumbs and page number processed
                    that.then(function() {
                        that.echo(obj.crumbs + ' ' + (pageIndex + 1));
                    });
                });
            });
        }(man));
    });
});

casper.then(function() {
    // Advance exactly once per manufacturer. Advancing per scraped product skips
    // manufacturers and never advances at all for a manufacturer with no products.
    keyIndex += 1;
    // Stop looping after the last manufacturer instead of reading mans[undefined].
    if (keyIndex < keys.length) {
        this.goto('scrape');
    }
});

/*** End ***/
casper.run(function() { this.exit(); });
// (Hosting-page footer: sign up for free or sign in on GitHub to comment on this gist.)