A scraper based on Node.js, written in CoffeeScript, using the cheerio and request
packages.
This is equivalent to the pjscrape example, but much faster: http://nrabinowitz.github.com/pjscrape/#overview
A scraper based on Node.js, written in CoffeeScript, using the cheerio and request
packages.
This is equivalent to the pjscrape example, but much faster: http://nrabinowitz.github.com/pjscrape/#overview
{
  "author": "",
  "name": "scrapping",
  "version": "0.0.0",
  "repository": {
    "url": ""
  },
  "engines": {
    "node": "~0.4.12"
  },
  "dependencies": {
    "request": "2.2.x",
    "cheerio": "0.2.x",
    "colors": "0.5.x",
    "underscore": "1.2.x",
    "ent": "0.0.x"
  },
  "devDependencies": {}
}
| request = require 'request' | |
| jsdom = require 'jsdom' | |
| cheerio = require 'cheerio' | |
| sys = require 'sys' | |
| colors = require 'colors' | |
| _ = require 'underscore' | |
| ent = require 'ent' | |
# Report a failure on stderr (coloured red) and terminate the process.
#
# err - an Error-like object (its `message` is shown), a response-like object
#       (its `statusCode` is shown), or any other value (shown as-is).
#       A falsy `err` makes the call a no-op, so this function can be used
#       directly as a node-style `(err, ...)` callback.
handleError = (err) ->
  return unless err
  err = err.message if err.message
  err = err.statusCode if err.statusCode
  # `.red` is added by `colors` to String.prototype only. A numeric status
  # code has no such property, so stringify first; otherwise the message is
  # printed as `undefined`.
  console.error 'Error', String(err).red
  process.exit -1
# Download `url` and pass the parsed document to `cb`.
#
# url - address of the page to fetch.
# cb  - node-style callback, invoked as `cb(err)` on a transport error,
#       `cb(response)` when the HTTP status is not 200, or
#       `cb(null, $, url)` on success, where `$` is the cheerio handle for
#       the page body. Falls back to `handleError` when omitted.
scrapp = (url, cb) ->
  cb = handleError unless cb
  console.log '>', url
  options =
    url: url
    # Route through the proxy named by the environment, if there is one.
    proxy: process.env['http_proxy']
  request options, (error, response, html) ->
    if error
      return cb error
    unless response.statusCode is 200
      return cb response
    cb null, cheerio.load(html), url
# Entry point: fetch the list of Vermont towns, follow every link found in a
# table cell, and print each linked page's heading together with the text of
# the cells next to its "Elevation" cell.
scrapp 'http://en.wikipedia.org/wiki/List_of_towns_in_Vermont', (err, $, url) ->
  return handleError err if err
  # 'http://host/path'.split('/') yields ['http:', '', 'host', ...]; the
  # empty middle element (`n`) is discarded.
  [protocol, n, host] = url.split('/')
  baseUrl = "#{protocol}//#{host}"
  # Anchors may lack an href; read it defensively so the filter below never
  # calls string methods on undefined.
  moreUrl = _.values($('td a')).map (it) -> it?.attribs?.href
  # Only root-relative paths ('/wiki/...') are valid when appended to baseUrl.
  # Skip absolute URLs, '#fragment' links and protocol-relative '//host' links.
  moreUrl = _.filter moreUrl, (it) ->
    typeof it is 'string' and it.charAt(0) is '/' and it.charAt(1) isnt '/'
  moreUrl = moreUrl.map (it) -> baseUrl + it
  moreUrl.forEach (url) -> scrapp url, (err, $, url) ->
    # A single failing page is logged but does not abort the other requests.
    return console.error err if err
    elevation = ''
    # The last cell mentioning "Elevation" wins; its sibling cells hold the value.
    $('td').each (i, elem) ->
      elevation = $(elem).siblings().text() if $(elem).text().indexOf('Elevation') != -1
    console.log name: $('#firstHeading').text(), elevation: ent.decode elevation