Combine multiple web pages into a single PDF https://scrapbox.io/taktamur/%E8%A4%87%E6%95%B0%E3%81%AEweb%E3%83%9A%E3%83%BC%E3%82%B8%E3%82%92%EF%BC%91%E3%81%A4%E3%81%AEpdf%E3%81%AB%E3%81%BE%E3%81%A8%E3%82%81%E3%82%8B
{
  "tasks": {
    // Delete the pdfs directory and recreate it from scratch
    "clean": "rm -rf pdfs && mkdir pdfs",
    // Read the paths file, scrape each URL, and save it as a PDF
    // (the trailing URL argument is not consumed by url2pdf.ts, which reads its list from stdin)
    "run": "cat paths-cloudflare-workers.txt | PUPPETEER_PRODUCT=chrome deno run --allow-env --allow-write --allow-read --allow-run --allow-net url2pdf.ts https://developers.cloudflare.com/workers/",
    "run_pages": "cat urls-cloudflare-pages.txt | PUPPETEER_PRODUCT=chrome deno run --allow-env --allow-write --allow-read --allow-run --allow-net url2pdf.ts",
    // Merge the generated PDFs into one file
    "combine": "pdftk pdfs/*.pdf cat output cloudflare-pages.pdf",
    // Fetch the given sitemap.xml with curl, convert the XML to JSON with xml2json, and extract the URLs with jq
    "sitemap2urls": "curl -s https://developers.cloudflare.com/sitemap-0.xml > sitemap.xml; xml2json sitemap.xml | jq '.urlset.url[].loc' -r > sitemap-url.txt"
  }
}
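The sitemap2urls task shells out to curl, xml2json, and jq. Roughly the same extraction can stay entirely inside Deno; the sketch below is an illustration rather than part of the gist, and the deno.land/x/xml module (and the exact shape of its parse output) is an assumption:

import { parse } from "https://deno.land/x/[email protected]/mod.ts";

// Fetch the sitemap and collect every <loc> under <urlset><url>.
// Assumption: the parser flattens leaf elements to strings and returns
// repeated <url> elements as an array (a sitemap with a single entry
// may come back as a plain object instead).
const res = await fetch("https://developers.cloudflare.com/sitemap-0.xml");
const doc = parse(await res.text()) as { urlset: { url: { loc: string }[] } };
const urls = doc.urlset.url.map((u) => u.loc);
await Deno.writeTextFile("sitemap-url.txt", urls.join("\n"));

The url2pdf.ts script that the run tasks pipe those URLs into follows.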
// see: https://deno.land/x/[email protected]
import { readAll } from "https://deno.land/[email protected]/io/util.ts";
import puppeteer from "https://deno.land/x/[email protected]/mod.ts";
import { Command } from "https://deno.land/x/[email protected]/command/mod.ts";

// Setup command (downloads the browser binary):
// PUPPETEER_PRODUCT=chrome deno run -A --unstable https://deno.land/x/[email protected]/install.ts

// Read the list of URLs from stdin, one per line
async function readPathsFromStdin(): Promise<string[]> {
  const decoder = new TextDecoder("utf-8");
  const input = await readAll(Deno.stdin);
  const inputText = decoder.decode(input).trim();
  return inputText.split("\n");
}

async function convertUrlsToPdf(urls: string[] = []) {
  // Derive the output PDF path from a URL
  function generatePdfPath(url: string) {
    const pageName = url.replace(/\//g, "_");
    return `./pdfs/${pageName}.pdf`;
  }
  // Collect the PDFs that already exist, so they can be skipped
  async function findExistingPdfPaths(urls: string[]): Promise<Set<string>> {
    const existingPdfPaths = new Set<string>();
    for (const url of urls) {
      const pdfPath = generatePdfPath(url);
      if (
        await Deno.stat(pdfPath)
          .then(() => true)
          .catch(() => false)
      ) {
        existingPdfPaths.add(pdfPath);
      }
    }
    return existingPdfPaths;
  }
  const existingPdfPaths = await findExistingPdfPaths(urls);
  const browser = await puppeteer.launch();
  for (const [i, url] of urls.entries()) {
    const pdfPath = generatePdfPath(url);
    if (existingPdfPaths.has(pdfPath)) {
      console.log(`skip ${url}`);
      continue;
    }
    console.log(`[${i + 1}/${urls.length}] ${url}`);
    // Open the page in the headless browser and print it to PDF
    const page = await browser.newPage();
    await page.goto(url);
    await page.pdf({ path: pdfPath, format: "A4" });
    // Close the tab so pages don't accumulate across a long URL list
    await page.close();
  }
  await browser.close();
}

if (import.meta.main) {
  await new Command()
    .name("url2pdf")
    .version("0.1.0")
    .description("Convert URLs to PDF files.")
    .action(async (_options) => {
      const paths = await readPathsFromStdin();
      // Await the conversion so the process doesn't exit before it finishes
      await convertUrlsToPdf(paths);
    })
    .parse(Deno.args);
}
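The combine task depends on the external pdftk binary. As a pure-Deno alternative, the pdf-lib library can merge the generated files. This is a minimal sketch under stated assumptions: the esm.sh import URL, the output name combined.pdf, and the explicit sort (Deno.readDir does not guarantee ordering, while the shell glob pdftk receives is sorted):

import { PDFDocument } from "https://esm.sh/[email protected]";

// Merge every PDF under ./pdfs into one file,
// mirroring `pdftk pdfs/*.pdf cat output ...`.
const names: string[] = [];
for await (const entry of Deno.readDir("./pdfs")) {
  if (entry.isFile && entry.name.endsWith(".pdf")) names.push(entry.name);
}
names.sort(); // readDir order is unspecified; match the sorted shell glob

const merged = await PDFDocument.create();
for (const name of names) {
  const doc = await PDFDocument.load(await Deno.readFile(`./pdfs/${name}`));
  const pages = await merged.copyPages(doc, doc.getPageIndices());
  for (const page of pages) merged.addPage(page);
}
await Deno.writeFile("combined.pdf", await merged.save());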