Last active
November 27, 2018 13:13
-
-
Save dvdbng/e2ec950ff63983932b7b19e0c8f273ac to your computer and use it in GitHub Desktop.
Download page DOM with CSS and images
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
Note: Due to CORS, this only works in Chromium with web security disabled.
Start Chrome like this: chromium-browser --disable-web-security --user-data-dir=/tmp/chrometmp
Load the page you want and paste the script into the web console.
This will save a snapshot of the DOM and inline all CSS and images, so that the page will usually
look exactly the same, but there is no JavaScript in the page.
Note that the generated file can be large, because the same URL might be inlined more than once.
*/
// NOTE: `var` is kept on purpose so the script can be pasted into the same
// console session more than once without a redeclaration error.

// Matches `@import "sheet.css"` / `@import 'sheet.css'`; group 1 is the raw
// (still CSS-escaped) URL. `@import url(...)` is not matched here, it is
// picked up by CSS_URL instead.
var CSS_IMPORT = /@import\s*["']([^"']+)["']/g;
// Matches `url(...)`; group 1 is the argument including its quotes, if any.
// The unquoted alternative accepts one-character URLs such as `url(a)`.
var CSS_URL = /\burl\(("[^"]+"|'[^']+'|[^"')][^)]*)\)/g;
// CSS constructs that are replaced by the word "blocked" in processed CSS.
var BAD_CSS = /(-moz-binding|expression\s*\(|javascript\s*:)/gi;
// URL constructor, falling back to the prefixed WebKit name. `globalThis` is
// the same object as `window` in a page.
var URL = (globalThis.URL || globalThis.webkitURL);
/**
 * Make sure the JSZip library is available as `window.JSZip`, loading it from
 * unpkg when it is not there yet.
 *
 * @returns {Promise<void>} Resolves once `window.JSZip` exists. Rejects when
 *   the script cannot be loaded, or loads without defining `window.JSZip`.
 */
function loadJszip() {
  return new Promise(function(resolve, reject) {
    if (window.JSZip) return resolve();

    // Presumably this hides any AMD loader of the host page so that the
    // library attaches itself to `window` instead of calling `define()`.
    window.define = null;

    var script = document.createElement('script');
    // NOTE(review): the version pin was unreadable in the copy this file came
    // from (e-mail obfuscation placeholder); 3.1.5 is assumed - confirm.
    script.src = 'https://unpkg.com/jszip@3.1.5/dist/jszip.min.js';
    script.onload = function() {
      if (window.JSZip) return resolve();
      reject(new Error('JSZip script loaded but window.JSZip is not defined'));
    };
    // Without this handler a blocked or failed download would leave the
    // promise pending forever.
    script.onerror = function() {
      reject(new Error('Failed to load JSZip from ' + script.src));
    };
    document.head.appendChild(script);
  });
}
// Lookup table keyed by `<tag>_<attribute>` (both lower-case). An entry marks
// an attribute whose value is a URL; isUrlAttribute() reads this table.
var URL_ATTRIBUTES = [
  'img_src',
  'link_href',
  'input_src',
  'body_background',
  'table_background',
  'td_background',
  'tr_background',
  'th_background',
  'tbody_background',
  'thead_background',
  'tfoot_background',
  'col_background',
  'colgroup_background',
  'section_background',
  'head_profile',
  'html_manifest',
  'command_icon',
  'embed_src',
  'object_data',
  'video_poster',
].reduce(function(table, key) {
  table[key] = true;
  return table;
}, {});
// Lookup table keyed by `<tag>_<attribute>` (both lower-case). An entry marks
// an attribute whose value is blanked in the snapshot; isBlockedAttribute()
// reads this table.
// Declared with `var`: a bare assignment creates an implicit global and throws
// a ReferenceError in strict or module code.
var BLOCKED_ATTRIBUTES = {
  iframe_src: true,
  script_src: true,
  img_srcset: true,
  // <picture><source srcset> would otherwise take precedence over the
  // rewritten <img src>, just like img_srcset.
  source_srcset: true,
};
/**
 * Tell whether `attribute` on a `tagName` element is listed in URL_ATTRIBUTES.
 *
 * @param {string} tagName Element tag name, any case.
 * @param {string} attribute Attribute name, any case.
 * @returns {boolean} `true` only for keys explicitly set to `true` in the table.
 */
function isUrlAttribute(tagName, attribute) {
  var key = tagName.toLowerCase() + '_' + attribute.toLowerCase();
  // Strict comparison so that inherited members of the table object (the key
  // can spell `__proto__`) are never reported as URL attributes.
  return URL_ATTRIBUTES[key] === true;
}
/**
 * Tell whether `attribute` on a `tagName` element is listed in
 * BLOCKED_ATTRIBUTES.
 *
 * @param {string} tagName Element tag name, any case.
 * @param {string} attribute Attribute name, any case.
 * @returns {boolean} `true` only for keys explicitly set to `true` in the table.
 */
function isBlockedAttribute(tagName, attribute) {
  var key = tagName.toLowerCase() + '_' + attribute.toLowerCase();
  // Strict comparison so that inherited members of the table object (the key
  // can spell `__proto__`) are never reported as blocked attributes.
  return BLOCKED_ATTRIBUTES[key] === true;
}
function downloadPage() { | |
var resources = {}; // content -> file | |
var resource_index = 0; | |
/**
 * Allocate the next numbered file name for a resource of the given MIME type.
 * Every call increments `resource_index`, so each returned name is unique.
 *
 * @param {?string} mime Content-Type value; may carry parameters
 *   (`text/css; charset=utf-8`) or be null when the header is missing.
 * @returns {string} `<index>.<ext>`, where ext is derived from the MIME type
 *   and falls back to `bin` for anything unrecognised.
 */
function getFileName(mime) {
  // MIME types are case-insensitive, and the header can be absent.
  var type = String(mime || '').trim().toLowerCase();
  var extension = 'bin';
  // Use the match result directly instead of the legacy static RegExp.$1.
  var match = /^(?:text\/(css|html)|image\/(gif|jpeg|png|svg|webp))/.exec(type);
  if (match) {
    extension = match[1] || match[2];
  } else if (/^text\/plain/.test(type)) {
    extension = 'txt';
  } else if (/^image\/(?:x-icon|vnd\.microsoft\.icon)/.test(type)) {
    extension = 'ico';
  }
  resource_index += 1;
  return resource_index + '.' + extension;
}
/**
 * Rewrite a piece of CSS so that every `@import "..."` and `url(...)` points
 * at the value returned by inlineUrl(), then neutralise BAD_CSS constructs.
 *
 * A reference that cannot be inlined is left exactly as written and a warning
 * is logged, so one failure does not abort the whole stylesheet.
 *
 * @param {?string} css_source CSS text (stylesheet, <style> content or a
 *   style attribute value).
 * @param {string} base_uri URL that relative references are resolved against.
 * @returns {Promise<string>} The rewritten CSS; '' for empty input.
 */
async function processCss(css_source, base_uri) {
  if (!css_source) return '';

  // Decode CSS hex escapes (`\26 `, `\01F600`): one to six hex digits plus an
  // optional single space. Invalid code points become U+FFFD, as in CSS.
  function unscape(str) {
    return str.replace(/\\([a-f0-9]{1,6}) ?/gi, function(match, charcode) {
      var code_point = parseInt(charcode, 16);
      var is_surrogate = code_point >= 0xD800 && code_point <= 0xDFFF;
      if (code_point === 0 || code_point > 0x10FFFF || is_surrogate) return '\uFFFD';
      return String.fromCodePoint(code_point);
    });
  }
  function unquote(str) {
    return str.replace(/^["']+/g, '').replace(/["']+$/g, '');
  }
  // Serialise a value as a double-quoted CSS string. Needed because inlineUrl()
  // may hand back a data: URL that itself contains quotes or backslashes.
  function quote(str) {
    return '"' + String(str).replace(/["\\]/g, '\\$&').replace(/\n/g, '\\a ') + '"';
  }

  // Pass 1: collect the distinct referenced URLs (decoded).
  // Map value: replacement text, or null while unknown / when inlining failed.
  var targets = new Map();
  css_source.replace(CSS_IMPORT, function(match, url) {
    targets.set(unscape(url), null);
    return match;
  });
  css_source.replace(CSS_URL, function(match, url) {
    targets.set(unscape(unquote(url)), null);
    return match;
  });

  // Pass 2: inline all of them in parallel, tolerating individual failures.
  await Promise.all(Array.from(targets.keys()).map(async function(url) {
    try {
      targets.set(url, await inlineUrl(url, base_uri));
    } catch (err) {
      console.warn('Could not inline "' + url + '" referenced from ' + base_uri, err);
    }
  }));

  // Pass 3: substitute. Unknown or failed references keep their original text.
  css_source = css_source.replace(CSS_IMPORT, function(match, url) {
    var replacement = targets.get(unscape(url));
    return replacement == null ? match : '@import ' + quote(replacement);
  });
  css_source = css_source.replace(CSS_URL, function(match, url) {
    var replacement = targets.get(unscape(unquote(url)));
    return replacement == null ? match : 'url(' + quote(replacement) + ')';
  });
  return css_source.replace(BAD_CSS, 'blocked');
}
/**
 * Read a Blob and return its contents as a `data:` URL.
 *
 * @param {Blob} blob Data to encode.
 * @returns {Promise<string>} Resolves with the data URL; rejects with the
 *   reader's error when the read fails.
 */
function blobToDataURL(blob) {
  return new Promise(function(resolve, reject) {
    var reader = new FileReader();
    reader.onload = function() { resolve(reader.result); };
    // Without an error handler a failed read would leave the promise pending.
    reader.onerror = function() {
      reject(reader.error || new Error('Failed to read blob'));
    };
    reader.readAsDataURL(blob);
  });
}
/**
 * Download the resource behind `url`, store it in `resources` under a newly
 * allocated file name and return the name to reference it by.
 *
 * - Empty and fragment-only references (`#id`) are returned unchanged: they
 *   would otherwise resolve to the base URL and download it.
 * - `data:` URLs are returned as their normalised href.
 * - Stylesheets (Content-Type text/css) are run through processCss() first.
 * - Each distinct URL (ignoring its fragment) is fetched once, even when it is
 *   requested concurrently; the fragment is re-appended to the returned name.
 *
 * NOTE: stylesheets that @import each other in a cycle are not supported; the
 * mutually waiting requests never settle.
 *
 * @param {string} url Reference as written in the document or stylesheet.
 * @param {string} baseurl URL the reference is resolved against.
 * @returns {Promise<string>} File name inside the bundle (plus any fragment),
 *   or the unchanged/normalised reference for the special cases above.
 */
async function inlineUrl(url, baseurl) {
  if (/^\s*(#|$)/.test(url)) { return url; }
  const target = new URL(url, baseurl);
  if (/^data:/i.test(target.href)) { return target.href; }

  // The fragment is not part of what is fetched; keep it out of the cache key.
  const fragment = target.hash;
  target.hash = '';
  const resolved = target.href;

  if (!inlineUrl.cache[resolved]) {
    // Cache the promise itself, before any await, so concurrent callers share
    // one download instead of each missing the cache.
    inlineUrl.cache[resolved] = (async function() {
      const resp = await fetch(resolved);
      const mime = resp.headers.get('Content-Type');
      let data;
      if (/^text\/css/i.test(mime)) {
        data = await processCss(await resp.text(), resolved);
      } else {
        data = await resp.blob();
      }
      const file_name = getFileName(mime);
      resources[file_name] = data;
      return file_name;
    })();
  }
  return (await inlineUrl.cache[resolved]) + fragment;
}
// Resolved absolute URL (without fragment) -> promise of the bundle file name.
inlineUrl.cache = {};
/**
 * Recursively rewrite a DOM subtree in place so that it no longer depends on
 * remote resources or scripts:
 *  - text inside <style> goes through processCss(); text inside <script> is
 *    emptied (tag names are compared case-insensitively, since SVG elements
 *    report lower-case tag names);
 *  - `style` attributes go through processCss();
 *  - URL attributes (isUrlAttribute) are replaced by inlineUrl()'s result;
 *    empty values and <link> elements that are not stylesheets/icons are left
 *    alone, and a failed download keeps the original value;
 *  - `on*` handlers and blocked attributes (isBlockedAttribute) are blanked.
 * Children are processed in parallel.
 *
 * @param {Node} elm Node to process; node types other than text and element
 *   are ignored.
 * @returns {Promise<void>}
 */
async function visitNode(elm) {
  // <link rel> tokens whose href is a resource worth bundling.
  var BUNDLED_LINK_RELS = ['stylesheet', 'icon', 'apple-touch-icon', 'apple-touch-icon-precomposed', 'mask-icon'];
  function isBundledLink(link) {
    var tokens = (link.getAttribute('rel') || '').toLowerCase().split(/\s+/);
    return tokens.some(function(token) { return BUNDLED_LINK_RELS.indexOf(token) !== -1; });
  }

  if (elm.nodeType === Node.TEXT_NODE) {
    var parent_tag = elm.parentNode && elm.parentNode.tagName
      ? elm.parentNode.tagName.toUpperCase()
      : '';
    if (parent_tag === 'STYLE') {
      elm.textContent = await processCss(elm.textContent, elm.baseURI);
    } else if (parent_tag === 'SCRIPT') {
      elm.textContent = '';
    }
    return;
  }
  if (elm.nodeType !== Node.ELEMENT_NODE) return;

  for (var i = 0; i < elm.attributes.length; i++) {
    var attr = elm.attributes[i];
    if (attr.name === 'style') {
      attr.value = await processCss(attr.value, elm.baseURI);
    } else if (isUrlAttribute(elm.tagName, attr.name)) {
      // An empty value would resolve to the page URL itself.
      if (attr.value.trim() === '') continue;
      // canonical/alternate/preconnect/... links are not page resources.
      if (elm.tagName.toUpperCase() === 'LINK' && !isBundledLink(elm)) continue;
      try {
        attr.value = await inlineUrl(attr.value, elm.baseURI);
      } catch (err) {
        // Keep the original reference rather than aborting the snapshot.
        console.warn('Could not inline ' + attr.name + '="' + attr.value + '"', err);
      }
    } else if (/^on/.test(attr.name) || isBlockedAttribute(elm.tagName, attr.name)) {
      attr.value = '';
    }
  }

  var pending = [];
  for (var child = elm.firstChild; child; child = child.nextSibling) {
    pending.push(visitNode(child));
  }
  await Promise.all(pending);
}
/**
 * Offer `blob` to the user as a file download named `filename` by clicking a
 * temporary, hidden <a download> element.
 *
 * The temporary anchor is removed again right after the click, and the object
 * URL is revoked a little later so that neither outlives the download.
 *
 * @param {string} filename Suggested name of the downloaded file.
 * @param {Blob} blob File contents.
 */
function download(filename, blob) {
  var object_url = URL.createObjectURL(blob);
  var element = document.createElement('a');
  element.setAttribute('href', object_url);
  element.setAttribute('download', filename);
  element.style.display = 'none';
  document.body.appendChild(element);
  try {
    element.click();
  } finally {
    document.body.removeChild(element);
    // Deferred: revoking right away could cut off a download that has only
    // just been handed to the browser.
    setTimeout(function() { URL.revokeObjectURL(object_url); }, 10000);
  }
}
/**
 * Snapshot `elm`: clone it, rewrite the clone with visitNode(), pack the
 * resulting HTML plus everything collected in `resources` into a zip and hand
 * the zip to download().
 *
 * @param {Element} elm Root element to snapshot (the document element).
 * @returns {Promise<void>} Settles once the download has been triggered;
 *   rejects when rewriting the clone, loading JSZip or zipping fails.
 */
async function getInline(elm) {
  const clone = elm.cloneNode(true);
  await Promise.all([
    visitNode(clone),
    loadJszip(),
  ]);

  // A <base href> in the snapshot would make the bundled relative file names
  // resolve against the original site instead of the extracted folder.
  Array.prototype.forEach.call(clone.querySelectorAll('base[href]'), function(base) {
    base.removeAttribute('href');
  });

  // document.doctype is null for pages without a doctype declaration.
  var doctype = document.doctype;
  var doctype_str = '';
  if (doctype) {
    doctype_str = "<!DOCTYPE "
      + doctype.name
      + (doctype.publicId ? ' PUBLIC "' + doctype.publicId + '"' : '')
      + (!doctype.publicId && doctype.systemId ? ' SYSTEM' : '')
      + (doctype.systemId ? ' "' + doctype.systemId + '"' : '')
      + '>\n';
  }

  var zip = new JSZip();
  zip.file("index.html", doctype_str + clone.outerHTML);
  Object.entries(resources).forEach(([file, data]) => {
    zip.file(file, data);
  });
  var content = await zip.generateAsync({type:"blob"});

  // Only ASCII letters and digits survive; fall back so that a title without
  // any of them does not produce the bare name ".zip".
  function sanitize(text) {
    return String(text || '').replace(/[^a-z0-9]/gi, '');
  }
  var zip_name = (sanitize(document.title) || sanitize(location.href) || 'page') + '.zip';
  download(zip_name, content);
}
getInline(document.documentElement); | |
} | |
downloadPage(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment