Skip to content

Instantly share code, notes, and snippets.

@qcom
Created July 10, 2013 19:20
Show Gist options
  • Save qcom/5969334 to your computer and use it in GitHub Desktop.
Save qcom/5969334 to your computer and use it in GitHub Desktop.
// File-system module; used below through fs.list, fs.read and fs.write.
var fs = require('fs');
// The scraper instance: a casper object created with the bundled jQuery listed
// as a client script and plugin loading switched off, then passed through the
// project's config and extend helpers.
// NOTE(review): index, login, domain, label and goto are used on this object
// further down but are not defined in this file — presumably the two helpers
// add them; confirm.
var casper = require('./util/augment/extend')(require('./util/augment/config')(require('casper').create({
clientScripts: ['includes/jquery-2.0.1.min.js'],
pageSettings: {
loadPlugins: false
}
})));
// Manufacturer data keyed by manufacturer name; the scrape step reads each
// entry's url, more, products and pages properties.
var mans = require('./mans');
// Manufacturer names, in the order they are visited.
var keys = Object.keys(mans);
// Index into `keys` of the manufacturer the scrape step works on.
var keyIndex = 0;
// Helper used by getNext to total the products expected for a manufacturer
// (implementation not shown in this file).
var sumIn = require('./util/sumIn');
/**
 * Normalizes a scraped label into an object key.
 *
 * Removes every ':' and '.' character, trims the result, lower-cases the
 * first word and appends the remaining words unchanged with no separator,
 * e.g. 'Order UOM:' -> 'orderUOM' and 'Mfr. Part No.:' -> 'mfrPartNo'.
 *
 * This function is passed as an argument to casper.evaluate() elsewhere in
 * this file, so keep it free of references to outer variables.
 *
 * @param {string} s Raw label text.
 * @returns {string} The cleaned key; '' for empty, null or undefined input.
 */
casper.cleanKey = function(s) {
  if (s === null || s === undefined) {
    return '';
  }
  // A string pattern given to replace() only removes the first match, which
  // left stray punctuation behind in labels with several dots or colons.
  // Splitting on any whitespace run keeps tabs/newlines out of the key.
  var words = String(s).replace(/[:.]/g, '').trim().split(/\s+/);
  var result = words[0].toLowerCase();
  for (var i = 1; i < words.length; i++) {
    result += words[i];
  }
  return result;
};
/**
 * Normalizes a scraped value by trimming surrounding whitespace.
 *
 * A missing value (null or undefined, e.g. a cell that has a label but no
 * value part) is returned as '' instead of throwing.
 *
 * @param {string} s Raw value text.
 * @returns {string} The trimmed value, or '' when no value was supplied.
 */
casper.cleanValue = function(s) {
  if (s === null || s === undefined) {
    return '';
  }
  return String(s).trim();
};
/**
 * Collects the product rows of a category listing page.
 *
 * Written to run inside the page: the scrape step passes it to
 * casper.evaluate(), and it relies on the page-level jQuery `$`.
 *
 * @param {string} domain Prefix prepended to the product and image URLs read
 *     from the page.
 * @param {Function} cleanKey Not used by this function; kept so the argument
 *     order expected by existing callers still lines up.
 * @param {Function} cleanValue Not used by this function; kept for the same
 *     reason.
 * @param {Object} obj Category node; its crumbs, fullCrumbs and path are
 *     copied onto every product.
 * @param {number} pageIndex Zero-based index of the listing page.
 * @param {string} manKey Manufacturer key, stored as manName.
 * @returns {Array} One plain object per `.productListing` row.
 */
casper.scrapeCategory = function(domain, cleanKey, cleanValue, obj, pageIndex, manKey) {
  var products = [];
  $('.productListing').each(function(i) {
    var $main = $(this).find('.desc2 .descInner');
    var $man = $main.find('.manufacName');
    var $manLink = $man.find('a');
    var manImageSrc = $man.find('img').attr('src');
    var $product = $main.find('.prodName a');
    var p = {};
    if ($manLink.length > 0) {
      p.manUrl = $manLink.attr('href');
    }
    // Only build the logo URL when the row really has an image; otherwise the
    // concatenation would produce "<domain>undefined".
    if (manImageSrc !== undefined) {
      p.manImageUrl = domain + manImageSrc;
    }
    p.url = domain + $product.attr('href');
    p.name = $product.text();
    p.sku = $main.find('.skuID').text().replace('SKU:', '').trim();
    // values inherited from the category node
    p.crumbs = obj.crumbs;
    p.fullCrumbs = obj.fullCrumbs;
    p.path = obj.path;
    p.manName = manKey;
    // position of the product in the listing (both 1-based)
    p.pageNumber = pageIndex + 1;
    p.numberOnPage = i + 1;
    products.push(p);
  });
  return products;
};
/**
 * Augments a product record with the data found on its detail page.
 *
 * Written to run inside the page: the scrape step passes it to
 * casper.evaluate(), and it relies on the page-level jQuery `$`. Spec keys
 * are written directly onto `p`, so a spec whose cleaned key matches an
 * existing property overwrites it.
 *
 * @param {Object} p Product record collected from the category page; mutated
 *     and returned.
 * @param {Function} cleanKey Turns a label into a property name.
 * @param {Function} cleanValue Normalizes a value string.
 * @param {string} domain Prefix prepended to the image URL read from the page.
 * @returns {Object} The same `p`, with imageUrl/imageAlt, skuPrefix, features
 *     and one property per specification row where present.
 */
casper.scrapeProduct = function(p, cleanKey, cleanValue, domain) {
  // main product image
  var $img = $('.productDetailLeft td:first img');
  if ($img.length > 0) {
    p.imageUrl = domain + $img.attr('src');
    p.imageAlt = $img.attr('alt');
  }

  // SKU prefix: taken from the last ':'-separated segment mentioning "prefix"
  var $sku = $('.skuID');
  if ($sku.length > 0) {
    var skuTexts = $sku.text().trim().split(':');
    var skuIndex = -1;
    for (var i = 0; i < skuTexts.length; i++) {
      if (skuTexts[i].toLowerCase().indexOf('prefix') !== -1) {
        skuIndex = i;
      }
    }
    // Compare against the sentinel rather than testing truthiness, so a match
    // in the very first segment (index 0) is not discarded.
    if (skuIndex !== -1) {
      p.skuPrefix = skuTexts[skuIndex].toLowerCase().replace('prefix', '').trim();
    }
  }

  // bullet-point features
  var $features = $('.product-features li');
  if ($features.length > 0) {
    p.features = [];
    $features.each(function() {
      p.features.push($(this).text());
    });
  }

  // label/value rows of the item details table
  $('.item-details-wrapper tr').each(function() {
    var $row = $(this);
    var k = $row.find('td.label').text();
    var v = $row.find('td:not(.label)').text();
    p[cleanKey(k)] = cleanValue(v);
  });

  // "Label: value" cells of the pricing/availability pod
  $('.pricing-availability-pod td[valign="middle"]:not(:has(input)):not(:has(a))').each(function() {
    var text = $(this).text();
    // Split at the first colon only, so values that themselves contain a
    // colon are kept whole; a cell without any colon becomes a key with an
    // empty value instead of handing undefined to cleanValue.
    var colon = text.indexOf(':');
    var k = colon === -1 ? text : text.slice(0, colon);
    var v = colon === -1 ? '' : text.slice(colon + 1);
    p[cleanKey(k)] = cleanValue(v);
  });

  // rows of the product data table (header rows excluded by the selector)
  $('.productdata-pod tr:not(".colheader")').each(function() {
    var $row = $(this);
    var k = $row.find('.bd').text();
    var v = $row.find('.desc').text();
    p[cleanKey(k)] = cleanValue(v);
  });

  return p;
};
/**
 * Finds the first manufacturer whose scrape is missing or incomplete.
 *
 * A manufacturer counts as pending when ./output has no "<name>.json" file
 * for it, or when that file holds a different number of products than
 * sumIn() reports for the manufacturer's entry in mans.json.
 *
 * @returns {?string} Key of the first pending manufacturer, or null when
 *     every manufacturer has been scraped completely.
 */
casper.getNext = function() {
  // the first two directory entries are skipped (presumably '.' and '..')
  var scrapedFiles = fs.list('./output').slice(2);
  // manufacturers recorded by the previous scrape
  var expected = JSON.parse(fs.read('/home/zach/nasco/products/mans.json'));
  var pending = Object.keys(expected).filter(function(manKey) {
    var fileName = manKey + '.json';
    // no data file yet: still to do
    if (scrapedFiles.indexOf(fileName) === -1) {
      return true;
    }
    // data file exists: pending only if its product count falls short of
    // (or otherwise differs from) the expected total
    var scraped = regularRequire('/home/zach/nasco/products/output/' + fileName);
    return scraped.length !== sumIn(expected[manKey]);
  });
  return pending.length > 0 ? pending[0] : null;
};
/*** Start ***/
casper.start(casper.index);

/*** Steps ***/
casper.then(casper.login);

// Loop entry point: the final step jumps back here once per manufacturer.
// NOTE(review): label()/goto() are not defined in this file — presumably they
// are added by the extend helper; confirm.
casper.label('scrape');
casper.then(function() {
  console.log(keyIndex);
  // Nothing to scrape (e.g. an empty manufacturer list); without this guard
  // keys[keyIndex] is undefined and man.url below throws.
  if (keyIndex >= keys.length) {
    return;
  }
  var manKey = keys[keyIndex];
  // cache the current manufacturer
  var man = mans[manKey];
  // Point casper to the manufacturer's url first, as category and product urls
  // can only be navigated to properly after visiting the manufacturer's url
  // (otherwise a category shows every product that belongs to it, instead of
  // only the products that belong to that category AND that manufacturer).
  this.thenOpen(man.url, function() {
    // log the current manufacturer's name
    this.echo('\n' + manKey);
    // all products scraped so far for the current manufacturer
    var manProducts = [];
    // cache `this` (the casper object) for the nested callbacks
    var that = this;
    // Recursive walk down to the deepest subcategory of every branch.
    // The parameter is called `obj` because after the first recursion it is a
    // category node rather than the manufacturer itself.
    (function r(obj) {
      if (obj.more) {
        // another level exists: obj.products holds one key per child branch
        Object.keys(obj.products).forEach(function(key) {
          r(obj.products[key]);
        });
      } else {
        // deepest category reached: open and scrape each of its listing pages
        that.each(obj.pages, function(self, page, pageIndex) {
          that.thenOpen(page.url, function() {
            // category-level data only; the product pages have not been visited yet
            var products = that.evaluate(that.scrapeCategory, that.domain, that.cleanKey, that.cleanValue, obj, pageIndex, manKey);
            that.each(products, function(self, product) {
              that.thenOpen(product.url, function() {
                // augment `product` with the data found on its own page
                var detailed = that.evaluate(that.scrapeProduct, product, that.cleanKey, that.cleanValue, that.domain);
                if (detailed) {
                  product = detailed;
                } else {
                  // evaluate() can come back empty (e.g. when the in-page
                  // function throws); keep the category-level record rather
                  // than storing null and crashing on product.sku below
                  that.echo('product page scrape failed, keeping listing data: ' + product.url);
                }
                manProducts.push(product);
                // rewrite the manufacturer's data file after every product so
                // that progress survives an interrupted run
                fs.write('./output/' + manKey + '.json', JSON.stringify(manProducts));
                // log the sku to signal the end of this product's scrape
                that.echo(product.sku);
              });
            });
            // once every product of this page has been scraped, log the crumbs
            // and the page number that was just processed
            that.then(function() {
              that.echo(obj.crumbs + ' ' + (pageIndex + 1));
            });
          });
        });
      }
    }(man));
  });
});
casper.then(function() {
  // Advance exactly once per manufacturer. Incrementing inside the product
  // callback moved the index once per product, which skipped manufacturers
  // and never advanced past a manufacturer with no products.
  keyIndex += 1;
  // Loop back only while manufacturers remain; otherwise fall through to run()'s
  // completion callback.
  if (keyIndex < keys.length) {
    this.goto('scrape');
  }
});

/*** End ***/
casper.run(function() { this.exit(); });
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment