Skip to content

Instantly share code, notes, and snippets.

@afiore
Created June 21, 2010 02:06
Show Gist options
  • Save afiore/446302 to your computer and use it in GitHub Desktop.
Save afiore/446302 to your computer and use it in GitHub Desktop.
Save a webpage (and its linked assets) into a single HTML file.
#! /usr/bin/env php
<?php
/***
* PageSnap
*
* Script for archiving an HTML page (and associated media assets) into a single file.
*
* Requires:
*
* - PhpQuery: http://code.google.com/p/phpquery
* - Parallel Curl: http://github.com/petewarden/parallelcurl
*/
require 'parallelcurl.php';
require 'phpQuery.php';
/**
 * Archives one HTML page into a single self-contained file by replacing the
 * URLs of its linked assets (stylesheets, scripts, images, ...) with base64
 * `data:` URLs.
 *
 * Typical use: `$snap = new PageSnap($url, $filename); $snap->fetch();`
 */
class PageSnap {
    /** URL of the page being archived. */
    public $url = null;

    /** Output file name; only its basename is used, so the file lands in the CWD. */
    public $filename = null;

    /** phpQuery document built from the downloaded page. */
    private $doc = null;

    /**
     * Associative array mapping each absolute asset URL to the list of
     * array(DOMElement, attribute name) pairs that reference it.
     */
    private $asset_urls = array();

    /** Set (URL => true) of asset URLs whose callback has already run. */
    private $embedded_urls = array();

    private $multi_curl = null;

    /** CSS selectors of the elements whose `src`/`href` assets get embedded. */
    public $selectors = array(
        'link[href]',
        'script[src]',
        'img[src]',
        'object[src]'
        //TODO: iframe frame
    );

    /**
     * Downloads the page and prepares the parallel downloader.
     *
     * @param string      $url      Absolute URL of the page to archive.
     * @param string|null $filename Output file name. Defaults to the URL with
     *                              scheme, leading "www." and trailing slash
     *                              removed, plus ".html".
     * @throws RuntimeException When the page itself cannot be downloaded.
     */
    public function __construct($url, $filename = null) {
        $this->url = $url;

        if ($filename) {
            $this->filename = $filename;
        } else {
            // Strip http:// or https:// and a literal "www." prefix; drop a
            // trailing slash so basename() does not end up as just ".html".
            $name = rtrim(preg_replace('@^https?://(www\.)?@i', '', $url), '/');
            $this->filename = ($name !== '' ? $name : 'index') . '.html';
        }

        $html = file_get_contents($url);
        if ($html === false) {
            throw new RuntimeException("Unable to download page: {$url}");
        }
        $this->doc = phpQuery::newDocument($html);

        $this->multi_curl = new ParallelCurl(10, array(
            CURLOPT_FOLLOWLOCATION => true,
            CURLOPT_USERAGENT => "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.1) Gecko/20061204 Firefox/2.0.0.1",
        ));
    }

    /**
     * Parallel Curl callback: embeds a downloaded asset as a data URL.
     *
     * @param string $response_body Raw body of the downloaded asset.
     * @param string $url           URL that was requested.
     * @param mixed  $ch            cURL handle of the finished transfer.
     * @param mixed  $element       Either a list of array(DOMElement, attribute)
     *                              pairs (as passed by fetch()) or a single
     *                              DOMElement, whose `href`/`src` attributes
     *                              are then both rewritten.
     */
    public function set_data_url($response_body, $url, $ch, $element) {
        if (isset($this->embedded_urls[$url])) {
            return;
        }
        $this->embedded_urls[$url] = true;
        file_put_contents('php://stderr', "fetching {$url}\n");

        // Never embed an error page or an empty body; the element keeps its
        // original URL in that case.
        $status = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
        if (!is_string($response_body) || $response_body === '' || $status >= 400) {
            file_put_contents('php://stderr', "skipping {$url} (HTTP {$status})\n");
            return;
        }

        // Keep only the media type, dropping parameters such as "; charset=...".
        $content_type = curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
        $mime = is_string($content_type) ? trim(preg_replace('/;.*$/', '', $content_type)) : '';
        if ($mime === '') {
            $mime = 'application/octet-stream';
        }
        $data_url = 'data:' . $mime . ';base64,' . base64_encode($response_body);

        $targets = is_array($element) ? $element : array($element);
        foreach ($targets as $target) {
            if (is_array($target)) {
                $node = $target[0];
                $attrs = array($target[1]);
            } else {
                // Bare element: rewrite whichever of href/src it carries.
                $node = $target;
                $attrs = array('href', 'src');
            }
            foreach ($attrs as $attr) {
                if ($node->hasAttribute($attr)) {
                    pq($node)->attr($attr, $data_url);
                }
            }
        }
    }

    /**
     * Resolve a (possibly relative) reference against the URL of the page.
     *
     * References that already carry a scheme or host are returned untouched,
     * protocol-relative references ("//host/path") inherit the page's scheme,
     * and everything else is resolved against the page's scheme, host, port
     * and directory. Dot segments ("../") are not collapsed here.
     *
     * Based on code by Stefano Faenza
     * http://www.stefanoforenza.com/how-to-build-an-absolute-url-in-php/
     *
     * @param string $u Reference found in the document.
     * @param string $p Absolute URL of the page containing the reference.
     * @return string The resolved URL.
     */
    private function absolute_url($u, $p) {
        $u = trim($u);
        $page = parse_url($p);
        if (!is_array($page)) {
            $page = array();
        }
        $scheme = isset($page['scheme']) ? $page['scheme'] : 'http';

        // Protocol-relative reference: only the scheme is missing.
        if (strpos($u, '//') === 0) {
            return $scheme . ':' . $u;
        }

        // Already absolute: must not be glued onto the page's directory.
        $url = parse_url($u);
        if (is_array($url) && (isset($url['scheme']) || isset($url['host']))) {
            return $u;
        }

        $authority = isset($page['host']) ? $page['host'] : '';
        if (isset($page['port'])) {
            $authority .= ':' . $page['port'];
        }

        if (strpos($u, '/') === 0) {
            // Path-absolute reference: relative to the site root.
            $path = $u;
        } else {
            // Relative reference: relative to the directory of the page.
            $base_dir = '/';
            if (isset($page['path']) && strrpos($page['path'], '/') !== false) {
                $base_dir = substr($page['path'], 0, strrpos($page['path'], '/') + 1);
            }
            $path = $base_dir . $u;
        }

        return $scheme . '://' . $authority . '/' . ltrim($path, '/');
    }

    /**
     * Maps assets URL to the DOM elements (and attributes) that reference them.
     *
     * Elements carrying a `rel` attribute are only considered when it is
     * "shortcut icon" or "stylesheet". Empty references and anything that
     * does not resolve to an http(s) URL (data:, javascript:, file:, ...) are
     * ignored.
     *
     * @param string $url URL of the page, used to resolve relative references.
     */
    private function get_urls($url) {
        $wanted_rels = array('shortcut icon', 'stylesheet');

        foreach ($this->selectors as $selector) {
            // Query this instance's own document rather than phpQuery's
            // global default document.
            foreach ($this->doc->find($selector) as $element) {
                $rel = strtolower(trim($element->getAttribute('rel')));
                if ($rel !== '' && !in_array($rel, $wanted_rels, true)) {
                    continue;
                }

                foreach (array('src', 'href') as $attr) {
                    if (!$element->hasAttribute($attr)) {
                        continue;
                    }
                    $reference = trim($element->getAttribute($attr));
                    if ($reference === '') {
                        continue;
                    }
                    $absolute_url = $this->absolute_url($reference, $url);
                    if (!preg_match('@^https?://@i', $absolute_url)) {
                        continue;
                    }
                    // Several elements may share one asset; remember them all.
                    $this->asset_urls[$absolute_url][] = array($element, $attr);
                }
            }
        }
    }

    /**
     * Updates the DOM with the data URLs of its linked assets and saves the
     * document in the current working directory.
     *
     * @param string|null $url Base URL used to resolve relative asset
     *                         references; defaults to the page URL.
     * @throws RuntimeException When the output file cannot be written.
     */
    public function fetch($url = null) {
        if (!$url) {
            $url = $this->url;
        }

        // Start from a clean slate so the method can safely be called again.
        $this->asset_urls = array();
        $this->embedded_urls = array();
        $this->get_urls($url);

        foreach ($this->asset_urls as $u => $targets) {
            $this->multi_curl->startRequest((string) $u, array($this, 'set_data_url'), $targets);
        }
        $this->multi_curl->finishAllRequests();

        //TODO: gzip file
        $target = basename($this->filename);
        if (file_put_contents($target, $this->doc->html()) === false) {
            throw new RuntimeException("Unable to write output file: {$target}");
        }
    }
}
// Command-line entry point: pagesnap <url> [output-file]
// The output file is optional; PageSnap derives a name from the URL when it
// is omitted.
if (!isset($argv[1]) || $argv[1] === '') {
    $script = isset($argv[0]) ? basename($argv[0]) : 'pagesnap';
    file_put_contents('php://stderr', "Usage: {$script} <url> [output-file]\n");
    exit(1);
}
try {
    $p = new PageSnap($argv[1], isset($argv[2]) ? $argv[2] : null);
    $p->fetch();
} catch (Exception $e) {
    // Report failures on stderr and signal them through the exit status.
    file_put_contents('php://stderr', $e->getMessage() . "\n");
    exit(1);
}
?>
@ryangurn
Copy link

ryangurn commented Jan 1, 2015

Can you work on a solution for CSS files that @import other CSS files? Is there any workaround for that?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment