Created
June 8, 2014 22:45
-
-
Save debrouwere/0b545409caa2da836269 to your computer and use it in GitHub Desktop.
Using content-addressable storage to efficiently and continually archive new versions of an HTML page including all related resources (images etc.)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### An interesting thing about news website homepages is that, while they
change all the time, the media on them doesn't change quite that fast: both
actual images but also stylesheets, javascript, logos and so on. Therefore,
when archiving these pages, it is possible to achieve significant space
savings by modifying the links to every image or other resource in the HTML to
instead refer to a file path that's a hash of the file's content: a type of
content-addressable storage.

The storage size can be further reduced by storing e.g. a day's worth of HTML
for one page (e.g. one fetch every hour) into a single lzip file, as LZMA can
very efficiently compress repetitions of almost-identical content, such as a
homepage changing over time. ###
request = require 'request' | |
cheerio = require 'cheerio' | |
async = require 'async' | |
url = require 'url' | |
fs = require 'fs' | |
fs.path = require 'path' | |
fs.mkdirp = require 'mkdirp' | |
crypto = require 'crypto' | |
_ = require 'underscore' | |
_.str = require 'underscore.string' | |
# Split a list (or string) into consecutive slices of length `size`;
# the final slice may be shorter. Used below to shard a hex digest
# into directory path segments.
chunk = (list, size) ->
  pieces = []
  offset = 0
  while offset < list.length
    pieces.push list.slice offset, offset + size
    offset += size
  pieces
# Persist `content` under a content-addressed path derived from its SHA-1
# digest -- e.g. filestore/1234/abcd/..../cdef.png -- and yield
# `callback(err, filename)` with the filestore-relative filename.
# Identical content always hashes to the same path, so an existing file can
# be reused as-is, and a concurrent duplicate write is harmless (both
# writers produce identical bytes).
save = (content, extension, callback) ->
  sum = crypto.createHash 'sha1'
  sum.update content
  digest = sum.digest 'hex'
  # shard the 40-char hex digest into 4-char directory segments so no
  # single directory accumulates an unbounded number of entries
  chunkedDigest = (chunk digest, 4).join '/'
  filename = chunkedDigest + extension
  path = fs.path.join 'filestore', filename
  console.log path
  # NOTE(review): fs.exists is deprecated on modern Node (prefer
  # fs.access); kept here for compatibility with the old runtime this
  # gist targets. The check-then-write race is benign for a CAS store.
  fs.exists path, (exists) ->
    if exists
      callback null, filename
    else
      mkdir = (cb) -> fs.mkdirp (fs.path.dirname path), cb
      # mkdirp reports the created directory, which async.waterfall
      # feeds to the next step; discard it explicitly so it can never
      # land in fs.writeFile's options/encoding slot (the original
      # relied on a trailing `undefined` plus writeFile taking its
      # callback from the last argument -- fragile across versions)
      write = (made, cb) -> fs.writeFile path, content, cb
      done = (err) -> callback err, filename
      async.waterfall [mkdir, write], done
# TODO: ideally do a HEAD request first that looks at the last-modified | |
# date and/or ETag, checks those against a cache we keep of | |
# <url> | <last_modified> | <hash> (w/ a TTL on each key, or a LRU) | |
# and if the last_modified we have corresponds to the last_modified | |
# of the HEAD request, we can simply use the existing hash, | |
# which cuts down on both bandwidth and processing time | |
# Fetch a single resource and persist it in the content-addressable
# store; yields `callback(err, filename)`.
# The extension is taken from the URL's pathname -- not the raw link --
# so query strings don't leak into the stored filename
# ('logo.png?v=3' stores as '.png', not '.png?v=3').
download = (link, callback) ->
  extension = fs.path.extname (url.parse link).pathname
  request.get {uri: link, encoding: null}, (err, res, body) ->
    # a failed fetch previously fell through to save() with an
    # undefined body and crashed inside the hash update
    return callback err if err
    save body, extension, callback
# TODO: it should be possible to specify, per site and independently | |
# for the homepage and for article pages, whether we can run a | |
# straight GET request or whether we need to run it in a | |
# headless browser, and extract the computed HTML instead | |
# (perhaps in some cases after executing an action, like a | |
# scroll event) | |
page = 'http://theguardian.com/uk'

# Fetch the page, archive every referenced resource under its content
# hash, rewrite the HTML to point at those hashes, and store the
# rewritten HTML itself content-addressed.
request.get page, (err, res, body) ->
  throw err if err
  $ = cheerio.load body
  # every linked resource except anchors: stylesheets, scripts, images,
  # favicons, etc.
  # NOTE(review): assumes cheerio's .map returns a plain array here;
  # newer cheerio mirrors jQuery and returns a wrapped set that needs
  # .get() -- verify against the installed version
  hrefs = ($ ':not(a)[href]').map -> ($ this).attr 'href'
  srcs = ($ ':not(a)[src]').map -> ($ this).attr 'src'
  relativeLinks = hrefs.concat srcs
  absoluteLinks = relativeLinks.map _.partial url.resolve, page
  async.map absoluteLinks, download, (err, digests) ->
    # a single failed download aborts the archive run cleanly instead
    # of silently rewriting links to "undefined"
    throw err if err
    links = _.object _.zip relativeLinks, digests
    # TODO: ideally we'd also rewrite any CSS file and suck out
    # references, because they may contain sprites and such
    # For both reliability and speed, don't recreate the HTML from the
    # cheerio representation, but replace the original links with our
    # content addresses directly in the raw markup.
    # String#replace takes no flags argument in V8 -- the previous
    # `body.replace link, hash, 'g'` silently replaced only the FIRST
    # occurrence of each link; split/join replaces every literal one.
    # NOTE(review): a link that is a substring of another link could
    # still be rewritten inside it -- consider processing links
    # longest-first if that ever bites.
    for link, hash of links
      body = body.split(link).join hash
    save body, '.html', (err, digest) ->
      throw err if err
      # TODO: at this point, you'd want to make a symlink from the
      # digest (a.k.a. the content-addressable filename) to something
      # that actually makes sense to human beings, like
      # /theguardian.com/2014/01/01/23:17.html
      console.log digest
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment