Created
June 8, 2014 22:45
-
-
Save debrouwere/0b545409caa2da836269 to your computer and use it in GitHub Desktop.
Using content-addressable storage to efficiently and continually archive new versions of an HTML page including all related resources (images etc.)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### An interesting thing about news website homepages is that, while they
change all the time, the media on them doesn't change quite that fast: both
actual images but also stylesheets, javascript, logos and so on. Therefore,
when archiving these pages, it is possible to achieve significant space
savings by modifying the links to every image or other resource in the HTML to
instead refer to a file path that's a hash of the file's content: a type of
content-addressable storage.

The storage size can be further reduced by storing e.g. a day's worth of HTML
for one page (e.g. one fetch every hour) into a single lzip file, as LZMA can
very efficiently compress repetitions of almost-identical content, such as a
homepage changing over time. ###
request = require 'request' | |
cheerio = require 'cheerio' | |
async = require 'async' | |
url = require 'url' | |
fs = require 'fs' | |
fs.path = require 'path' | |
fs.mkdirp = require 'mkdirp' | |
crypto = require 'crypto' | |
_ = require 'underscore' | |
_.str = require 'underscore.string' | |
# Split a list (or string) into consecutive slices of length `size`;
# the final slice may be shorter. Used below to shard a hex digest
# into directory path segments.
chunk = (list, size) ->
  pieces = []
  offset = 0
  while offset < list.length
    pieces.push list.slice offset, offset + size
    offset += size
  pieces
# Persist `content` under a content-addressed path derived from its SHA-1
# digest -- e.g. filestore/1234/abcd/..../cdef.png -- and yield
# `callback(err, filename)` with the filestore-relative filename.
# Identical content always hashes to the same path, so an existing file can
# be reused as-is, and a concurrent duplicate write is harmless (both
# writers produce identical bytes).
save = (content, extension, callback) ->
  sum = crypto.createHash 'sha1'
  sum.update content
  digest = sum.digest 'hex'
  # shard the 40-char hex digest into 4-char directory segments so no
  # single directory accumulates an unbounded number of entries
  chunkedDigest = (chunk digest, 4).join '/'
  filename = chunkedDigest + extension
  path = fs.path.join 'filestore', filename
  console.log path
  # NOTE(review): fs.exists is deprecated on modern Node (prefer
  # fs.access); kept here for compatibility with the old runtime this
  # gist targets. The check-then-write race is benign for a CAS store.
  fs.exists path, (exists) ->
    if exists
      callback null, filename
    else
      mkdir = (cb) -> fs.mkdirp (fs.path.dirname path), cb
      # mkdirp reports the created directory, which async.waterfall
      # feeds to the next step; discard it explicitly so it can never
      # land in fs.writeFile's options/encoding slot (the original
      # relied on a trailing `undefined` plus writeFile taking its
      # callback from the last argument -- fragile across versions)
      write = (made, cb) -> fs.writeFile path, content, cb
      done = (err) -> callback err, filename
      async.waterfall [mkdir, write], done
# TODO: ideally do a HEAD request first that looks at the last-modified | |
# date and/or ETag, checks those against a cache we keep of | |
# <url> | <last_modified> | <hash> (w/ a TTL on each key, or a LRU) | |
# and if the last_modified we have corresponds to the last_modified | |
# of the HEAD request, we can simply use the existing hash, | |
# which cuts down on both bandwidth and processing time | |
# Fetch a single resource and persist it in the content-addressable
# store; yields `callback(err, filename)`.
# The extension is taken from the URL's pathname -- not the raw link --
# so query strings don't leak into the stored filename
# ('logo.png?v=3' stores as '.png', not '.png?v=3').
download = (link, callback) ->
  extension = fs.path.extname (url.parse link).pathname
  request.get {uri: link, encoding: null}, (err, res, body) ->
    # a failed fetch previously fell through to save() with an
    # undefined body and crashed inside the hash update
    return callback err if err
    save body, extension, callback
# TODO: it should be possible to specify, per site and independently | |
# for the homepage and for article pages, whether we can run a | |
# straight GET request or whether we need to run it in a | |
# headless browser, and extract the computed HTML instead | |
# (perhaps in some cases after executing an action, like a | |
# scroll event) | |
page = 'http://theguardian.com/uk'

# Fetch the page, archive every referenced resource under its content
# hash, rewrite the HTML to point at those hashes, and store the
# rewritten HTML itself content-addressed.
request.get page, (err, res, body) ->
  throw err if err
  $ = cheerio.load body
  # every linked resource except anchors: stylesheets, scripts, images,
  # favicons, etc.
  # NOTE(review): assumes cheerio's .map returns a plain array here;
  # newer cheerio mirrors jQuery and returns a wrapped set that needs
  # .get() -- verify against the installed version
  hrefs = ($ ':not(a)[href]').map -> ($ this).attr 'href'
  srcs = ($ ':not(a)[src]').map -> ($ this).attr 'src'
  relativeLinks = hrefs.concat srcs
  absoluteLinks = relativeLinks.map _.partial url.resolve, page
  async.map absoluteLinks, download, (err, digests) ->
    # a single failed download aborts the archive run cleanly instead
    # of silently rewriting links to "undefined"
    throw err if err
    links = _.object _.zip relativeLinks, digests
    # TODO: ideally we'd also rewrite any CSS file and suck out
    # references, because they may contain sprites and such
    # For both reliability and speed, don't recreate the HTML from the
    # cheerio representation, but replace the original links with our
    # content addresses directly in the raw markup.
    # String#replace takes no flags argument in V8 -- the previous
    # `body.replace link, hash, 'g'` silently replaced only the FIRST
    # occurrence of each link; split/join replaces every literal one.
    # NOTE(review): a link that is a substring of another link could
    # still be rewritten inside it -- consider processing links
    # longest-first if that ever bites.
    for link, hash of links
      body = body.split(link).join hash
    save body, '.html', (err, digest) ->
      throw err if err
      # TODO: at this point, you'd want to make a symlink from the
      # digest (a.k.a. the content-addressable filename) to something
      # that actually makes sense to human beings, like
      # /theguardian.com/2014/01/01/23:17.html
      console.log digest
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment