Created
May 2, 2011 18:39
-
-
Save kasperlanger/952114 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env coffee | |
# `crawler.coffee` - a simple HTTP crawler that records the timing of each GET request.
# Require http, jquery and underscore | |
http = require('http') | |
$ = require('jquery') | |
_ = require('underscore') | |
# Set options for http client.
# `auth` is an HTTP Basic `Authorization` header value: the `user:password`
# pair is base64-encoded once here and reused through `options.headers`.
# `Buffer.from` is used because the `new Buffer(string)` constructor is deprecated.
auth = 'Basic ' + Buffer.from('demo2:demo').toString('base64')
options =
  host: 'localhost'
  port: 8080
  headers:
    Authorization: auth
# Crawl bookkeeping:
# `queue`   - discovered paths waiting to be requested (pushed at the end, shifted from the front)
# `results` - one `{time, path}` record per completed request
# `visited` - paths that must not be requested (again); the logout path is
#             pre-seeded, presumably so the crawl does not end its own session
queue = []
results = []
visited =
  '/view/auth/logout': 'blacklist'
# `padl` - right-align a value in a column of spaces.
#
# str    - value to pad; it is converted to a string first
# length - minimum width of the result (defaults to 5)
#
# Returns the string form of `str`, prefixed with spaces until it is at least
# `length` characters long. Values that are already wide enough come back unchanged.
padl = (str, length = 5) ->
  padded = "" + str
  padded = " " + padded while padded.length < length
  padded
# `get` - HTTP GET `path` and call `handler` exactly once with the response body.
#
# path    - request path; it is written into the shared `options` object, which
#           supplies host, port and headers
# handler - called as `handler(content)` with the body decoded as UTF-8 text.
#           If the request or the response stream fails, the error is logged
#           and the handler is called as `handler('', error)` instead, so one
#           bad request does not abort the whole run.
#
# Returns the request object created by `http.get`.
get = (path, handler) ->
  content = []
  settled = false
  # Single exit point: an error that arrives after (or instead of) 'end'
  # must never invoke the handler a second time.
  settle = (body, error) ->
    return if settled
    settled = true
    console.error "GET #{path} failed: #{error.message}" if error?
    handler body, error
  options.path = path
  request = http.get options, (res) ->
    # Decode inside the stream so a multi-byte character split across two
    # chunks is not corrupted when the chunks are joined.
    res.setEncoding 'utf8'
    res.on 'data', (chunk) ->
      content.push chunk
    res.on 'error', (error) ->
      settle '', error
    res.on 'end', ->
      settle content.join ""
  # Without this listener a refused or reset connection raises an unhandled
  # 'error' event and terminates the process.
  request.on 'error', (error) ->
    settle '', error
  request
# `visit` - Crawl pages breadth first, one request at a time, starting at `path`.
#
# path - the next candidate path; may be undefined once the queue has drained
#
# Candidates that are missing, already recorded in `visited`, or `mailto:` links
# are skipped, and the next one is taken from `queue`. When no candidate is
# left the function returns without issuing a request, which ends the crawl.
# For each page fetched it logs and records the elapsed time, appends every
# anchor `href` found in the page to `queue`, then moves on to the next entry.
visit = (path) ->
  # Skip with a loop rather than recursion so a long run of duplicate links
  # cannot exhaust the call stack.
  while not path? or visited[path] or path.match(/^mailto:/)
    return if queue.length is 0
    path = queue.shift()
  visited[path] = true
  start = Date.now()
  get path, (content) ->
    diff = Date.now() - start
    console.log "#{padl diff} ms #{path}"
    results.push {time: diff, path: path}
    $(content).find('a').each ->
      href = $(this).attr('href')
      # Anchors without an href yield undefined; do not queue those.
      queue.push(href) if href?
      # Explicit bare return: jQuery stops iterating when a callback returns false.
      return
    visit(queue.shift())
# Kick off the crawl at the site root.
visit('/')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment