Created
November 26, 2015 15:57
-
-
Save harrygr/aec8180cb43d992a3af1 to your computer and use it in GitHub Desktop.
Paul Graham Essay Generator
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"require": { | |
"fabpot/goutte": "^3.1", | |
"ucsdmath/pdf": "^1.4" | |
}, | |
"autoload": { | |
"psr-4": { | |
"": "" | |
} | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/** | |
* Paul Graham Essay Generator | |
* | |
* Author: Harry G (http://github.com/harrygr) | |
* Copyright: Licensed under the GPL-3 (http://www.gnu.org/licenses/gpl-3.0.html) | |
* | |
* This script scrapes Paul Graham's site and generates a PDF file of all the essays | |
* | |
* Usage: | |
* - copy the file and composer.json into a directory | |
* - run `composer install` to install the dependencies | |
* - run it from your terminal with `php pgessays.php` | |
* | |
* A PDF file named "PGs Essays.pdf" will be saved in the current directory | |
* | |
* To avoid hammering Paul's site with repeated requests, the script caches the results of the scrape | |
* in a sections.json file. Delete this file if you want a fresh copy of the essays (e.g. if some new ones are added). | |
* | |
* L | |
* | |
*/ | |
require "vendor/autoload.php"; | |
use Goutte\Client; | |
use UCSDMath\Pdf\Pdf; | |
/**
 * Scrapes the essay index, downloads every linked essay and writes them all
 * into a single PDF file.
 *
 * The scraped sections are cached as JSON in the working directory so that a
 * later run can rebuild the PDF without touching the network.
 */
class PgEssayGenerator {
    /** @var Client HTTP client used to fetch the index page and each essay. */
    private $client;

    /** @var Pdf PDF writer that the essays are appended to. */
    private $pdf;

    /** @var string URL of the page listing the essay links. */
    private $main_url = 'http://www.paulgraham.com/articles.html';

    /** @var string Path of the JSON cache holding the scraped sections. */
    private $cache_file = 'sections.json';

    /** @var array List of ['title' => ..., 'body' => ...] entries, one per essay. */
    private $sections = [];

    /**
     * @param Client $client HTTP client used for all page requests.
     * @param Pdf    $pdf    PDF writer that receives the essay content.
     */
    public function __construct(Client $client, Pdf $pdf)
    {
        $this->client = $client;
        $this->pdf = $pdf;
    }

    /**
     * Build the PDF, scraping the essays first unless a usable cache exists.
     *
     * The index page is only requested when the cache is missing or
     * unreadable, so a cached run makes no HTTP requests at all.
     */
    public function generate()
    {
        $cached = $this->loadCachedSections();

        if ($cached !== null) {
            $this->sections = $cached;
        } else {
            $this->scrapeSections();
        }

        $this->buildPdf();
    }

    /**
     * Read the cached sections from disk.
     *
     * @return array|null The decoded sections, or null when the cache file is
     *                    missing, unreadable or does not decode to an array.
     */
    private function loadCachedSections()
    {
        if (!file_exists($this->cache_file)) {
            return null;
        }

        $json = file_get_contents($this->cache_file);
        $sections = ($json === false) ? null : json_decode($json, true);

        if (!is_array($sections)) {
            // An empty or corrupt cache would otherwise reach buildPdf() as a non-array.
            echo "Ignoring unreadable cache file '{$this->cache_file}'; scraping again" . PHP_EOL;
            return null;
        }

        return $sections;
    }

    /**
     * Fetch the index page, scrape every linked essay and cache the result.
     */
    private function scrapeSections()
    {
        // Start from a clean list so a repeated call cannot duplicate entries.
        $this->sections = [];

        $crawler = $this->client->request('GET', $this->main_url);
        $crawler->filter('table td[width=435] a')->each(function ($node) {
            $this->addSection($node->link()->getUri(), $node->text());
        });

        $json = json_encode($this->sections);
        if ($json === false) {
            // json_encode() returns false on failure (e.g. malformed UTF-8);
            // writing that out would leave an empty, unusable cache file.
            echo 'Could not cache the scraped sections: ' . json_last_error_msg() . PHP_EOL;
            return;
        }

        file_put_contents($this->cache_file, $json);
    }

    /**
     * Download one essay and append it to the section list.
     *
     * @param string $link  Absolute URL of the essay page.
     * @param string $title Link text of the essay, used as its heading.
     */
    private function addSection($link, $title)
    {
        echo "Generating section '$title'" . PHP_EOL;

        $crawler = $this->client->request('GET', $link);
        $node = $crawler->filter('table[width=435] td[width=435] font, table[width=374] td[width=374] font');

        if (count($node)) {
            $body = $node->html();
        } else {
            // No matching content cell: fall back to the raw page. This may be
            // false on failure; buildPdf() skips non-string bodies.
            $body = file_get_contents($link);
        }

        $this->sections[] = [
            'title' => $title,
            'body' => $body,
        ];
    }

    /**
     * Write every valid section into the PDF and render it to "PGs Essays.pdf".
     */
    private function buildPdf()
    {
        $this->pdf
            ->initializePageSetup('A4', 'Portrait')
            ->setOutputDestination('F')
            ->setFilename('PGs Essays.pdf');

        $count = count($this->sections);

        foreach ($this->sections as $index => $section) {
            $n = $index + 1;
            $title = isset($section['title']) ? $section['title'] : null;
            $body = isset($section['body']) ? $section['body'] : null;

            if (!is_string($title) || !is_string($body)) {
                echo "Skipping section $n of $count: missing title or body" . PHP_EOL;
                continue;
            }

            echo "Adding section $n of $count: '{$title}' to PDF" . PHP_EOL;

            // The title is plain text, so escape it before embedding it in markup.
            $heading = htmlspecialchars($title, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
            $this->pdf->appendPageContent('<h1>' . $heading . '</h1>');
            $this->pdf->appendPageContent($body);
        }

        $this->pdf->render();
    }
}
// Wire the scraper client and the PDF writer together, then build the essay PDF.
$essayGenerator = new PgEssayGenerator(new Client(), new Pdf());
$essayGenerator->generate();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi,
I had to change line 49 to
$crawler->filter('table:nth-child(6) td[width=435] a')->each(function ($node) {
to avoid scraping the top table (with 'suggested links'). Also, the PDF library no longer works well with modern PHP (it loads an old version of mpdf), but I haven't had the chance to look into upgrading that.
Thanks for the script!