admariner · May 31, 2020 06:14
diff --git a/.gitignore b/.gitignore
 node_modules
 build
 *.min.js
diff --git a/README.md b/README.md
diff --git a/gulpfile.js b/gulpfile.js
 var gulp = require('gulp'),
    clipboard = require('gulp-clipboard'),
    uglify = require('gulp-uglify'),
    rename = require('gulp-rename'),
    artoo = require('gulp-artoo');

 // Default build task: index.js is piped through uglify, renamed to
 // hacker_news.bookmark.js, handed to the gulp-artoo and gulp-clipboard
 // plugins (presumably bookmarklet wrapping and clipboard copy -- see the
 // plugins' docs), and finally written into ./build
 gulp.task('default', function() {
  var stream = gulp.src('./index.js');

  // Each plugin is created right before being piped, in pipeline order
  stream = stream.pipe(uglify());
  stream = stream.pipe(rename('hacker_news.bookmark.js'));
  stream = stream.pipe(artoo());
  stream = stream.pipe(clipboard());

  // Returning the last stream lets gulp know when the task is over
  return stream.pipe(gulp.dest('./build'));
 });
diff --git a/index.js b/index.js
 ;(function($, undefined) {

  // Reads the integer a text starts with ('42 points' -> 42, '1 point' -> 1).
  // Falls back to 0 when the text does not start with a number
  // ('' or 'discuss' for instance).
  function leadingInt(text) {
    var nb = parseInt(text, 10);
    return isNaN(nb) ? 0 : nb;
  }

  // Specifications to scrape one page's posts
  var scraper = {

    // We iterate on Hacker News posts
    iterator: 'tr tr:has(td.title:has(a)):not(:last)',

    // The following object represents the data we want to retrieve.
    // The scrape method, like a lot of artoo's methods, is really polymorphic
    // and the same thing may be expressed in a great variety of ways.
    // Just use the way that fits your coding style the most.
    data: {

      // For the title, a simple subselector suffices (the text of the element is taken by default)
      title: {sel: '.title a'},

      // Same for the url, except that we request the 'href' attribute
      url: {sel: '.title a', attr: 'href'},

      // The following ones are trickier as we need to process data a little bit
      domain: {

        // The sel parameter here is the same as $(currentIteratedEl).find('.comhead')
        sel: '.comhead',
        method: function($) {

          // $(this) is therefore $(currentIteratedEl).find('.comhead')
          // artoo follows the jQuery paradigm whenever it can.
          // Surrounding whitespace and parentheses are dropped.
          return $(this).text().trim().replace(/[\(\)]/g, '');
        }
      },

      // But if you prefer to use a function, right away, help yourself.
      // The leading integer is parsed rather than stripping ' points', so
      // that the singular '1 point' yields 1 instead of NaN.
      score: function($) {
        return leadingInt($(this).find('+ tr [id^=score]').text());
      },

      // Note that the 'method' function takes artoo's jquery reference as argument.
      // This is made so you can access your desired version of jQuery without having to force it
      // to the global scope.
      user: {
        sel: '+ tr a[href^=user]',
        method: function($) {

          // null when no user link was matched
          return $(this).length ? $(this).text() : null;
        }
      },
      nb_comments: {
        sel: '+ tr a[href^=item]',
        method: function($) {

          // Same parsing as the score: '1 comment' yields 1 and a link
          // without any count (e.g. 'discuss') yields 0
          return leadingInt($(this).text());
        }
      }
    }
  };

  // Function retrieving the next page's url: the href of the link matched
  // by 'td.title:last > a' within the given page
  function nextUrl($page) {
    var $moreLink = $page.find('td.title:last > a');

    return $moreLink.attr('href');
  }

  // The page currently displayed is scraped right away so that we do not
  // have to request by ajax what is already loaded
  artoo.log.debug('Starting the scraper...');
  var frontpage = artoo.scrape(scraper);

  // Iterator giving the spider the url of the next page to fetch.
  // The spider stops if it returns false, else you'll need a limit param.
  // On the first call (falsy index) the url is read from the current
  // document, afterwards from the data handed over by the spider.
  function pickNextUrl(i, $data) {
    if (i) {
      return nextUrl($data);
    }

    return nextUrl($(document));
  }

  // Configuration object passed to the spider
  var spiderSettings = {

    // Only two more pages are fetched, which totals three with the first one
    limit: 2,

    // The HTML retrieved by ajax is scraped with the same specifications
    scrape: scraper,

    // New elements are concatenated into the spider's accumulator so that
    // we end up with a flat list
    concat: true,

    // Final callback of the spider: we tell the user that the wait is over
    // and we download the first page's posts followed by the fetched ones
    done: function(data) {
      artoo.log.debug('Finished retrieving data. Downloading...');

      var allPosts = frontpage.concat(data);
      artoo.savePrettyJson(allPosts, {filename: 'hacker_news.json'});
    }
  };

  // Then we launch the ajax spider
  artoo.ajaxSpider(pickNextUrl, spiderSettings);
 }).call(this, artoo.$);
diff --git a/package.json b/package.json
 {
  "name": "hackernews-scraper",
  "version": "0.1.0",
  "description": "A little artoo.js bookmarklet to scrape and download the first three pages of the famous Hacker News.",
  "main": "index.js",
  "author": "Yomguithereal",
  "license": "MIT",
  "dependencies": {
    "gulp": "~3.8.7",
    "gulp-uglify": "~0.3.1",
    "gulp-artoo": "0.0.1",
    "gulp-clipboard": "~0.1.1",
    "gulp-rename": "~1.2.0"
  }
 }
	var gulp = require('gulp'),
	clipboard = require('gulp-clipboard'),
	uglify = require('gulp-uglify'),
	rename = require('gulp-rename'),
	artoo = require('gulp-artoo');

	// Default build task: index.js is piped through uglify, renamed to
	// hacker_news.bookmark.js, handed to the gulp-artoo and gulp-clipboard
	// plugins (presumably bookmarklet wrapping and clipboard copy -- see the
	// plugins' docs), and finally written into ./build
	gulp.task('default', function() {
	  var stream = gulp.src('./index.js');

	  // Each plugin is created right before being piped, in pipeline order
	  stream = stream.pipe(uglify());
	  stream = stream.pipe(rename('hacker_news.bookmark.js'));
	  stream = stream.pipe(artoo());
	  stream = stream.pipe(clipboard());

	  // Returning the last stream lets gulp know when the task is over
	  return stream.pipe(gulp.dest('./build'));
	});
	;(function($, undefined) {

	// Reads the integer a text starts with ('42 points' -> 42, '1 point' -> 1).
	// Falls back to 0 when the text does not start with a number
	// ('' or 'discuss' for instance).
	function leadingInt(text) {
	  var nb = parseInt(text, 10);
	  return isNaN(nb) ? 0 : nb;
	}

	// Specifications to scrape one page's posts
	var scraper = {

	  // We iterate on Hacker News posts
	  iterator: 'tr tr:has(td.title:has(a)):not(:last)',

	  // The following object represents the data we want to retrieve.
	  // The scrape method, like a lot of artoo's methods, is really polymorphic
	  // and the same thing may be expressed in a great variety of ways.
	  // Just use the way that fits your coding style the most.
	  data: {

	    // For the title, a simple subselector suffices (the text of the element is taken by default)
	    title: {sel: '.title a'},

	    // Same for the url, except that we request the 'href' attribute
	    url: {sel: '.title a', attr: 'href'},

	    // The following ones are trickier as we need to process data a little bit
	    domain: {

	      // The sel parameter here is the same as $(currentIteratedEl).find('.comhead')
	      sel: '.comhead',
	      method: function($) {

	        // $(this) is therefore $(currentIteratedEl).find('.comhead')
	        // artoo follows the jQuery paradigm whenever it can.
	        // Surrounding whitespace and parentheses are dropped.
	        return $(this).text().trim().replace(/[\(\)]/g, '');
	      }
	    },

	    // But if you prefer to use a function, right away, help yourself.
	    // The leading integer is parsed rather than stripping ' points', so
	    // that the singular '1 point' yields 1 instead of NaN.
	    score: function($) {
	      return leadingInt($(this).find('+ tr [id^=score]').text());
	    },

	    // Note that the 'method' function takes artoo's jquery reference as argument.
	    // This is made so you can access your desired version of jQuery without having to force it
	    // to the global scope.
	    user: {
	      sel: '+ tr a[href^=user]',
	      method: function($) {

	        // null when no user link was matched
	        return $(this).length ? $(this).text() : null;
	      }
	    },
	    nb_comments: {
	      sel: '+ tr a[href^=item]',
	      method: function($) {

	        // Same parsing as the score: '1 comment' yields 1 and a link
	        // without any count (e.g. 'discuss') yields 0
	        return leadingInt($(this).text());
	      }
	    }
	  }
	};

	// Function retrieving the next page's url: the href of the link matched
	// by 'td.title:last > a' within the given page
	function nextUrl($page) {
	  var $moreLink = $page.find('td.title:last > a');

	  return $moreLink.attr('href');
	}

	// The page currently displayed is scraped right away so that we do not
	// have to request by ajax what is already loaded
	artoo.log.debug('Starting the scraper...');
	var frontpage = artoo.scrape(scraper);

	// Iterator giving the spider the url of the next page to fetch.
	// The spider stops if it returns false, else you'll need a limit param.
	// On the first call (falsy index) the url is read from the current
	// document, afterwards from the data handed over by the spider.
	function pickNextUrl(i, $data) {
	  if (i) {
	    return nextUrl($data);
	  }

	  return nextUrl($(document));
	}

	// Configuration object passed to the spider
	var spiderSettings = {

	  // Only two more pages are fetched, which totals three with the first one
	  limit: 2,

	  // The HTML retrieved by ajax is scraped with the same specifications
	  scrape: scraper,

	  // New elements are concatenated into the spider's accumulator so that
	  // we end up with a flat list
	  concat: true,

	  // Final callback of the spider: we tell the user that the wait is over
	  // and we download the first page's posts followed by the fetched ones
	  done: function(data) {
	    artoo.log.debug('Finished retrieving data. Downloading...');

	    var allPosts = frontpage.concat(data);
	    artoo.savePrettyJson(allPosts, {filename: 'hacker_news.json'});
	  }
	};

	// Then we launch the ajax spider
	artoo.ajaxSpider(pickNextUrl, spiderSettings);
	}).call(this, artoo.$);
	{
	"name": "hackernews-scraper",
	"version": "0.1.0",
	"description": "A little artoo.js bookmarklet to scrape and download the first three pages of the famous Hacker News.",
	"main": "index.js",
	"author": "Yomguithereal",
	"license": "MIT",
	"dependencies": {
	"gulp": "~3.8.7",
	"gulp-uglify": "~0.3.1",
	"gulp-artoo": "0.0.1",
	"gulp-clipboard": "~0.1.1",
	"gulp-rename": "~1.2.0"
	}
	}