Skip to content

Instantly share code, notes, and snippets.

@taktamur
Last active September 21, 2024 11:18
Show Gist options
  • Save taktamur/a14804f67db7715ca42c392f9be40a85 to your computer and use it in GitHub Desktop.
Save taktamur/a14804f67db7715ca42c392f9be40a85 to your computer and use it in GitHub Desktop.
複数のwebページを1つのpdfにまとめる
{
"tasks": {
// Delete the pdfs directory and recreate it empty.
"clean": "rm -rf pdfs && mkdir pdfs",
// Pipe a list file into url2pdf.ts on stdin; each listed entry is rendered to a PDF.
// "run" reads paths-cloudflare-workers.txt, "run_pages" reads urls-cloudflare-pages.txt.
// NOTE(review): "run" also passes a trailing URL as a positional argument — confirm
// that url2pdf.ts actually accepts and uses it.
"run": "cat paths-cloudflare-workers.txt | PUPPETEER_PRODUCT=chrome deno run --allow-env --allow-write --allow-read --allow-run --allow-net url2pdf.ts https://developers.cloudflare.com/workers/",
"run_pages": "cat urls-cloudflare-pages.txt | PUPPETEER_PRODUCT=chrome deno run --allow-env --allow-write --allow-read --allow-run --allow-net url2pdf.ts",
// Concatenate the generated PDFs into a single file with pdftk.
"combine": "pdftk pdfs/*.pdf cat output cloudflare-pages.pdf",
// Download the sitemap with curl, convert the XML to JSON with xml2json,
// then extract the URLs with jq into sitemap-url.txt.
"sitemap2urls": "curl -s https://developers.cloudflare.com/sitemap-0.xml > sitemap.xml; xml2json sitemap.xml | jq '.urlset.url[].loc' -r > sitemap-url.txt"
}
}
// see: https://deno.land/x/[email protected]
import { readAll } from "https://deno.land/[email protected]/io/util.ts";
import puppeteer from "https://deno.land/x/[email protected]/mod.ts";
import { Command } from "https://deno.land/x/[email protected]/command/mod.ts";
// Setup command (run once before first use):
// PUPPETEER_PRODUCT=chrome deno run -A --unstable https://deno.land/x/[email protected]/install.ts
/**
 * Reads all of standard input and returns it as a list of entries, one per
 * line.
 *
 * Each line is trimmed (which also strips the `\r` left behind by CRLF line
 * endings) and blank lines are dropped, so empty input yields `[]` instead of
 * `[""]` and stray blank lines never reach the browser as empty URLs.
 *
 * @returns The non-empty, trimmed lines from stdin, in input order.
 */
async function readPathsFromStdin(): Promise<string[]> {
  const decoder = new TextDecoder("utf-8");
  const input = await readAll(Deno.stdin);
  return decoder
    .decode(input)
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter((line) => line.length > 0);
}
/**
 * Renders each URL to an A4 PDF under `./pdfs/` using a headless browser.
 *
 * URLs whose PDF already exists when the function starts are skipped, so an
 * interrupted run can be resumed by invoking the function again with the same
 * list. Each page is closed as soon as its PDF has been written, and the
 * browser is closed even when a navigation or render step throws.
 *
 * @param urls URLs to convert; defaults to an empty list.
 * @throws Propagates any error raised while navigating to or rendering a URL;
 *   remaining URLs are not processed in that case.
 */
async function convertUrlsToPdf(urls: string[] = []): Promise<void> {
  // Derives the output file path from a URL by replacing every "/" with "_".
  function generatePdfPath(url: string) {
    const pageName = url.replace(/\//g, "_");
    return `./pdfs/${pageName}.pdf`;
  }

  // Collects the output paths that already exist on disk (used for skipping).
  async function findExistingPdfPaths(urls: string[]): Promise<Set<string>> {
    const existingPdfPaths = new Set<string>();
    for (const url of urls) {
      const pdfPath = generatePdfPath(url);
      try {
        await Deno.stat(pdfPath);
        existingPdfPaths.add(pdfPath);
      } catch {
        // stat failed: treat the PDF as not generated yet.
      }
    }
    return existingPdfPaths;
  }

  // Make sure the output directory exists so page.pdf() cannot fail on a
  // missing folder; `recursive` makes this a no-op when it is already there.
  await Deno.mkdir("./pdfs", { recursive: true });

  const existingPdfPaths = await findExistingPdfPaths(urls);
  const browser = await puppeteer.launch();
  try {
    for (const [i, url] of urls.entries()) {
      const pdfPath = generatePdfPath(url);
      if (existingPdfPaths.has(pdfPath)) {
        console.log(`skip ${url}`);
        continue;
      }
      console.log(`[${i + 1}/${urls.length}] ${url}`);
      // Open the page in the headless browser and print it to PDF.
      const page = await browser.newPage();
      try {
        await page.goto(url);
        await page.pdf({ path: pdfPath, format: "A4" });
      } finally {
        // Release the tab right away; otherwise one stays open per URL.
        await page.close();
      }
    }
  } finally {
    // Always shut the browser down so a failure does not orphan the process.
    await browser.close();
  }
}
// CLI entry point: runs only when this module is executed directly.
// Reads the URL list from stdin and converts every entry to a PDF.
if (import.meta.main) {
  await new Command()
    .name("url2pdf")
    .version("0.1.0")
    .description("Convert URLs to PDF files.")
    .action(async (_options) => {
      const paths = await readPathsFromStdin();
      // Await the conversion so failures reject the action (and therefore
      // parse()) instead of surfacing as an unhandled promise rejection.
      await convertUrlsToPdf(paths);
    })
    .parse(Deno.args);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment