A scraper based on Node.js, written in CoffeeScript and using the cheerio and request
packages.
This is an equivalent of the pjscrape example, but much faster:
| # Bus 0.1(alpha) | |
| # (c) 2010 John Wright, QuickLeft Inc. | |
| # Bus may be freely distributed under the MIT license. | |
| # For all details and documentation: | |
| # http://github.com/mrjjwright/Bus | |
| # | |
| # | |
| # | |
| # Bus would not be possible without Jeremy Ashkenas who wrote CoffeeScript, the language |
| methodMap = { | |
| 'create': 'POST', | |
| 'update': 'PUT', | |
| 'delete': 'DELETE', | |
| 'read': 'GET' | |
| }; | |
| getUrl = function(object) { | |
| if (!(object && object.url)) { | |
| throw new Error("A 'url' property or function must be specified"); | |
| } else { |
| require "nosqlite" | |
| require "Math.uuid" | |
| convert_callback: (row) -> | |
| if not row.guid? | |
| row.guid: Math.uuidFast(); | |
| return row; | |
| db_file: "my_db.sqlite" |
| require.paths.unshift("/Users/johnw/js/node_modules") | |
| require("./underscore") | |
| sys: require("sys") | |
| rest: require("restler") | |
| rest.get('http://github.com/api/v2/json/repos/show/mrjjwright').addListener('complete', | |
| ((data) -> | |
| repositories: JSON.parse(data).repositories | |
| for repository in repositories |
| cheerio = require('cheerio') | |
| Shred = require('shred') | |
| shred = new Shred() | |
| http = require('http') | |
| URL = require('url') | |
| server = http.createServer (request, response) -> | |
| url = URL.parse(request.url, true) | |
| urlToDiscover = url.query['url'] | |
| startDiscovery urlToDiscover, (theImageURL) -> |
| var fs = require('fs'), | |
| async = require('async'); | |
| var try_series = function(func_name, func, data, times, cb){ | |
| var start = new Date(); | |
| var tries = new Array(times); | |
| for(var i = 0 ; i < times ; i++) { | |
| tries[i] = function(callback){func(data, callback);}; | |
| } | |
| async.series(tries, function(err,result){ |
| var cheerio = require('cheerio') | |
| var request = require('request') | |
| var pictureTube = require('picture-tube') | |
| var url = require('url') | |
| var async = require('async') | |
| var site = process.argv[2] | |
| console.log('fetching', site) | |
| request(site, function(e,r,b) { |
| var http = require('http'), | |
| https = require('https'), | |
| Iconv = require('iconv').Iconv, | |
| iconv = new Iconv('EUC-JP', 'UTF-8//TRANSLIT//IGNORE'), | |
| cheerio = require('cheerio'), | |
| request = require('request'); | |
| var site = 'http://www.hit.ac.jp/gakusei/chgschool/', | |
| port = 8880; |
| var sip = require('sip'); | |
| var sys = require('sys'); | |
| var redis = require('redis'); | |
| //Trim leading and trailing whitespace from string values. | |
| function trim(str) { | |
| return str.replace(/^\s+|\s+$/g, ''); | |
| } | |
| sip.start({},function(request) { |