Skip to content

Instantly share code, notes, and snippets.

@rcarmo
Created August 18, 2017 11:53
Show Gist options
  • Save rcarmo/f9223997a4a295c0537c303086a3c35a to your computer and use it in GitHub Desktop.
Save rcarmo/f9223997a4a295c0537c303086a3c35a to your computer and use it in GitHub Desktop.
ES6 script for creating web site snapshots in .webarchive format (WIP)
const bplist = require('bplist-creator'),
request = require('request'),
xpath = require('xpath'),
dom = require('xmldom').DOMParser,
url = require('url'),
fs = require('fs');
// User-agent string sent with every request (identifies as desktop Safari).
const USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.1 Safari/603.1.30";

// Request headers attached to every outgoing fetch.
const ACCEPT_HEADERS = {
  "User-Agent": USER_AGENT,
  Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language": "en-us",
  "Accept-Encoding": "gzip",
};

// XPath expressions selecting the attributes that carry subresource URLs:
// image sources (including lazy-load/retina variants), scripts and stylesheets.
const XPATH_PATTERNS = [
  "//img/@src",
  "//img/@data-src",
  "//img/@data-src-retina",
  "//img/@data-cid",
  "//script/@src",
  "//link[@rel='stylesheet']/@href",
];
/**
 * Fetch one URL and expose the callback-style `request` call as a Promise.
 *
 * The request follows all redirects, sends ACCEPT_HEADERS, enables gzip and
 * passes `encoding: null` (presumably so the body arrives as raw bytes rather
 * than a decoded string — confirm against the request documentation).
 *
 * @param {string} url - Address to fetch.
 * @returns {Promise<{url: string, headers: Object, body: *}>} Resolves with
 *   the URL exactly as it was passed in (not a post-redirect URL), the
 *   response headers and the response body. Rejects with the error reported
 *   by `request`.
 */
function requestAsync(url) {
  return new Promise((resolve, reject) => {
    const options = {
      followAllRedirects: true,
      url,
      headers: ACCEPT_HEADERS,
      gzip: true,
      encoding: null,
    };

    const onResponse = (err, response, body) => {
      if (err) {
        reject(err);
        return;
      }
      resolve({ url, headers: response.headers, body });
    };

    request(options, onResponse);
  });
}
/**
 * Fetch several URLs concurrently, tolerating individual failures.
 *
 * Each URL is requested through requestAsync. A failed request is logged with
 * console.error and left out of the result, so one broken subresource does not
 * discard the responses that did succeed, and callers always receive an array
 * (never undefined).
 *
 * @param {string[]} urls - Addresses to fetch.
 * @returns {Promise<Object[]>} The successful requestAsync results, in the
 *   same relative order as `urls`. The array may be shorter than `urls` when
 *   some requests failed, and is empty when `urls` is empty.
 */
async function getParallel(urls) {
  const attempts = urls.map((address) =>
    requestAsync(address).catch((err) => {
      // Isolate the failure to this URL; null marks it for removal below.
      console.error(err);
      return null;
    })
  );
  const settled = await Promise.all(attempts);
  return settled.filter((entry) => entry !== null);
}
// Alternate, simpler page that is handy for manual testing:
//   "http://help.websiteos.com/websiteos/example_of_a_simple_html_page.htm"
const target = "https://theverge.com";

/**
 * Collect the subresource URLs referenced by a parsed document.
 *
 * Every XPATH_PATTERNS expression is evaluated against the document and each
 * matched attribute value is resolved against `baseUrl`.
 *
 * @param {Object} doc - Document produced by the xmldom DOMParser.
 * @param {string} baseUrl - URL used to resolve relative references.
 * @returns {string[]} Unique URLs starting with "http", in first-seen order.
 */
function collectSubresourceUrls(doc, baseUrl) {
  const resolved = XPATH_PATTERNS
    .map((pattern) =>
      xpath.select(pattern, doc).map((attr) => url.resolve(baseUrl, attr.value))
    )
    // Explicit initial value keeps the fold safe if the pattern list is empty.
    .reduce((all, found) => all.concat(found), []);

  // Keep only fetchable (http/https) URLs; a Set removes duplicates while
  // preserving the order in which they were first seen.
  return [...new Set(resolved.filter((href) => href.startsWith("http")))];
}

/**
 * Download `target` plus the images, scripts and stylesheets it references,
 * and write them to "test.webarchive" as a binary property list.
 *
 * Nothing is written when the main response is not served as text/html.
 *
 * @returns {Promise<void>} Settles once the write has been scheduled; rejects
 *   if fetching or parsing the main page fails.
 */
async function main() {
  const result = await requestAsync(target);

  // The header may be absent; treat that as "not HTML" instead of throwing.
  const contentType = result.headers['content-type'] || "";
  if (!contentType.includes("text/html")) {
    console.log(`Skipping ${target}: content type "${contentType}" is not text/html`);
    return;
  }

  const doc = new dom().parseFromString(String(result.body));
  const urls = collectSubresourceUrls(doc, target);

  // getParallel may resolve without an array; treat that as "no subresources"
  // rather than crashing on .map below.
  const fetched = await getParallel(urls);
  const results = Array.isArray(fetched) ? fetched : [];

  const webarchive = {
    'WebMainResource': {
      "WebResourceData": result.body,
      "WebResourceMIMEType": "text/html",
      "WebResourceURL": target
    },
    'WebSubresources': results.map((r) => ({
      "WebResourceData": r.body,
      "WebResourceMIMEType": r.headers['content-type'],
      "WebResourceURL": r.url
    }))
  };

  fs.writeFile("test.webarchive", bplist(webarchive), "binary", (err) => {
    // Only report real failures; a successful write passes a null error.
    if (err) console.log(err);
  });
}

main().catch((reason) => console.log(reason));
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment