|
;(function($, undefined) { |
|
|
|
// Specifications to scrape one page's posts

// Parse the leading integer of a label such as '42 points' or '1 comment'.
// Labels that do not start with a number ('discuss', an empty string...)
// yield 0. Reading only the leading integer keeps the singular forms
// ('1 point', '1 comment') from turning into NaN.
function parseCount(label) {
  var count = parseInt(label, 10);
  return isNaN(count) ? 0 : count;
}

var scraper = {

  // We iterate on Hacker News posts.
  // NOTE(review): ':not(:last)' drops the last matching row, presumably the
  // pagination row -- confirm against the page markup.
  iterator: 'tr tr:has(td.title:has(a)):not(:last)',

  // The following object represents the data we want to retrieve.
  // The scrape method, like a lot of artoo's methods, is really polymorphic
  // and the same thing may be expressed in a great variety of ways.
  // Just use the way that fits your coding style the most.
  data: {

    // For the title, a simple subselector suffices (the text of the element
    // is taken by default)
    title: {sel: '.title a'},

    // Same for the url, except that we request the 'href' attribute
    url: {sel: '.title a', attr: 'href'},

    // The following are trickier as we need to process the data a little bit
    domain: {

      // The sel parameter here is the same as
      // $(currentIteratedEl).find('.comhead')
      sel: '.comhead',
      method: function($) {

        // $(this) is therefore $(currentIteratedEl).find('.comhead')
        // artoo follows the jQuery paradigm whenever it can.
        // The surrounding whitespace and parentheses are stripped.
        return $(this).text().trim().replace(/[\(\)]/g, '');
      }
    },

    // But if you prefer to use a function right away, help yourself.
    // Returns the score as a number, 0 when the row shows no score.
    score: function($) {
      return parseCount($(this).find('+ tr [id^=score]').text());
    },

    // Note that the 'method' function takes artoo's jquery reference as
    // argument. This is made so you can access your desired version of
    // jQuery without having to force it to the global scope.
    // Returns the user name, or null when no user link is found.
    user: {
      sel: '+ tr a[href^=user]',
      method: function($) {
        return $(this).length ? $(this).text() : null;
      }
    },

    // Returns the number of comments, 0 when the link text holds no number.
    nb_comments: {
      sel: '+ tr a[href^=item]',
      method: function($) {
        return parseCount($(this).text());
      }
    }
  }
};
|
|
|
// Function to retrieve the next page's url.
//
// $page: jQuery-like collection wrapping the page to search.
// Returns the 'href' of the link found by 'td.title:last > a', or false when
// no such link (or an empty href) is found, so that a spider iterator
// relying on this function hands back an explicit false rather than
// undefined.
function nextUrl($page) {
  var href = $page.find('td.title:last > a').attr('href');
  return href || false;
}
|
|
|
// The first page is already loaded, so it is scraped in place instead of
// being fetched again through ajax.
artoo.log.debug('Starting the scraper...');
var frontpage = artoo.scrape(scraper);

// Iterator handed to the spider: it returns the next page url.
// It stops the spider if it returns false, else you'll need a limit param.
var pickNextUrl = function(i, $data) {

  // On the first call (i is 0) the url is read from the current document,
  // afterwards from the data passed in by the spider.
  var $source = i ? $data : $(document);
  return nextUrl($source);
};

// Configuration object passed to the spider
var spiderConfig = {

  // We only want to fetch two more pages, to total three with the first one.
  limit: 2,

  // We want to scrape the HTML retrieved by ajax
  scrape: scraper,

  // New elements are concatenated in the spider's accumulator so that we
  // end up with a flat list
  concat: true,

  // Final callback of the spider: we tell the user that the wait is over
  // and we download the front page's posts followed by the fetched ones.
  done: function(data) {
    artoo.log.debug('Finished retrieving data. Downloading...');

    var allPosts = frontpage.concat(data);
    artoo.savePrettyJson(allPosts, {filename: 'hacker_news.json'});
  }
};

// Then we launch the ajax spider
artoo.ajaxSpider(pickNextUrl, spiderConfig);
|
}).call(this, artoo.$); |