Skip to content

Instantly share code, notes, and snippets.

@lpenaud
Last active November 25, 2025 16:45
Show Gist options
  • Select an option

  • Save lpenaud/dbcf1877a8dd2138f9ac64f4a01a4646 to your computer and use it in GitHub Desktop.

Select an option

Save lpenaud/dbcf1877a8dd2138f9ac64f4a01a4646 to your computer and use it in GitHub Desktop.
PDF ocr + compress

OCR PDF

A Deno script that runs Tesseract and Ghostscript to produce a small PDF with a searchable text layer.

The input file can be an image or a PDF, as long as the format is supported by Tesseract and Ghostscript.

Usage

deno run --allow-read \
  --allow-write \
  --allow-run=tesseract,gs \
  https://gist.githubusercontent.com/lpenaud/dbcf1877a8dd2138f9ac64f4a01a4646/raw/ebook.ts \
  input.tiff \
  output.pdf

Dependencies

#!/usr/bin/env -S deno run --allow-read --allow-write --allow-run=tesseract,gs
import * as stdPath from "jsr:@std/path";
/**
 * A `Deno.Command` that prints its full command line (executable followed by
 * arguments) to the console right before the process is started, whichever of
 * `output()`, `outputSync()` or `spawn()` is used.
 */
class VerboseCommand extends Deno.Command {
  /** Executable name (or URL href) followed by the arguments, kept for logging. */
  #commandLine: string[];

  /**
   * @param command Executable to run, as a string or a URL.
   * @param options Options forwarded unchanged to `Deno.Command`.
   */
  constructor(command: string | URL, options: Deno.CommandOptions) {
    super(command, options);
    let executable: string;
    if (typeof command === "string") {
      executable = command;
    } else {
      executable = command.href;
    }
    const extraArgs = options.args === undefined ? [] : options.args;
    this.#commandLine = [executable, ...extraArgs];
  }

  /** Logs the command line, then delegates to `Deno.Command.output()`. */
  override output(): Promise<Deno.CommandOutput> {
    this.#announce();
    return super.output();
  }

  /** Logs the command line, then delegates to `Deno.Command.outputSync()`. */
  override outputSync(): Deno.CommandOutput {
    this.#announce();
    return super.outputSync();
  }

  /** Logs the command line, then delegates to `Deno.Command.spawn()`. */
  override spawn(): Deno.ChildProcess {
    this.#announce();
    return super.spawn();
  }

  /** Prints the executable and each argument as separate `console.log` values. */
  #announce() {
    console.log(...this.#commandLine);
  }
}
/** Options accepted by the `GhostScriptCommand` constructor. */
interface GhostScriptOptions {
  /** Path of the file handed to `gs` as its input. */
  infile: string;
  /** Path that `gs` is told to write to through `-sOutputFile=`. */
  outfile: string;
}
/**
 * Command that runs Ghostscript (`gs`) with the `pdfwrite` device and the
 * `/ebook` PDF settings preset to rewrite `infile` into `outfile`.
 *
 * Equivalent shell invocation:
 *   gs -sDEVICE=pdfwrite -dCompatibilityLevel=1.4 "-dPDFSETTINGS=/ebook" -dNOPAUSE -dQUIET -dBATCH -sOutputFile=final.pdf output.pdf
 */
class GhostScriptCommand extends VerboseCommand {
  /**
   * @param options Input and output paths; both are resolved to absolute
   *   paths before being passed to `gs`.
   */
  constructor(options: GhostScriptOptions) {
    const destination = stdPath.resolve(options.outfile);
    const source = stdPath.resolve(options.infile);
    // Preset reference:
    // https://ghostscript.readthedocs.io/en/latest/VectorDevices.html#controls-and-features-specific-to-postscript-and-pdf-input
    // Other presets: /screen, /printer, /prepress, /default
    const preset = "-dPDFSETTINGS=/ebook";
    const gsArgs: string[] = [
      "-sDEVICE=pdfwrite",
      "-dCompatibilityLevel=1.4",
      preset,
      "-dNOPAUSE",
      "-dQUIET",
      "-dBATCH",
      `-sOutputFile=${destination}`,
      source,
    ];
    super("gs", { args: gsArgs });
  }
}
/** Options accepted by the `TesseractCommand` constructor. */
interface TesseractOptions {
  /** Path of the file handed to `tesseract` as its input. */
  infile: string;
  /** Desired output path; its extension is dropped to form the output base. */
  outfile: string;
  /** Value passed to the `-l` flag of `tesseract` (e.g. "fra"). */
  language: string;
}
/**
 * Command that runs `tesseract -l <language> <infile> <outputbase> pdf`.
 *
 * The output base is `outfile` with its extension removed.
 * NOTE(review): this presumably relies on Tesseract appending ".pdf" itself,
 * so an `outfile` with another extension would not be written where the
 * caller asked. Confirm against the Tesseract documentation.
 */
class TesseractCommand extends VerboseCommand {
  /**
   * @param options Input path, OCR language and desired output path; paths
   *   are resolved to absolute paths before being passed to `tesseract`.
   */
  constructor(options: TesseractOptions) {
    const parsedOutfile = stdPath.parse(options.outfile);
    const source = stdPath.resolve(options.infile);
    // Directory + file name without extension.
    const outputBase = stdPath.resolve(parsedOutfile.dir, parsedOutfile.name);
    const tesseractArgs: string[] = ["-l", options.language, source, outputBase, "pdf"];
    super("tesseract", { args: tesseractArgs });
  }
}
/** Options accepted by `clean`. */
interface CleanOptions {
  /** Path of the temporary file to remove. */
  temp: string;
}
/**
 * Removes the temporary file and reports the removal on the console.
 *
 * @param options Holds the path of the temporary file to delete.
 */
function clean(options: CleanOptions): void {
  const tempPath = options.temp;
  Deno.removeSync(tempPath);
  console.log("Removed:", tempPath);
}
/**
 * Entry point: OCRs INFILE with Tesseract into a temporary PDF, then rewrites
 * that PDF with Ghostscript into OUTFILE.
 *
 * The temporary file is always removed, including when one of the tools
 * cannot be started (for example because the binary is missing) and
 * `outputSync()` throws.
 *
 * `outputSync()` captures the child's stdout/stderr instead of showing them,
 * so when a tool exits unsuccessfully its captured stderr is printed to let
 * the user see why it failed.
 *
 * @param args Command-line arguments; exactly `[INFILE, OUTFILE]` is expected.
 * @returns Process exit code: 1 on bad usage, otherwise the exit code of
 *   Tesseract if it failed, or the exit code of Ghostscript.
 */
function main(args: string[]): number {
  if (args.length !== 2) {
    console.log(
      "Usage:",
      import.meta.filename ?? import.meta.url,
      "INFILE",
      "OUTFILE",
    );
    return 1;
  }
  const [infile, outfile] = args;

  const decoder = new TextDecoder();
  // Surfaces the diagnostics that outputSync() captured from a failed tool.
  const reportFailure = (tool: string, output: Deno.CommandOutput): void => {
    console.error(`${tool} failed with exit code ${output.code}`);
    const details = decoder.decode(output.stderr).trimEnd();
    if (details !== "") {
      console.error(details);
    }
  };

  const temp = Deno.makeTempFileSync({
    prefix: "ebook",
    suffix: ".pdf",
  });
  try {
    const tesseract = new TesseractCommand({
      infile,
      language: "fra",
      outfile: temp,
    });
    const tesseractOutput = tesseract.outputSync();
    if (!tesseractOutput.success) {
      reportFailure("tesseract", tesseractOutput);
      return tesseractOutput.code;
    }
    const gs = new GhostScriptCommand({
      infile: temp,
      outfile,
    });
    const gsOutput = gs.outputSync();
    if (!gsOutput.success) {
      reportFailure("gs", gsOutput);
    }
    return gsOutput.code;
  } finally {
    // Runs on every path, including exceptions thrown while spawning a tool.
    clean({ temp });
  }
}
// Run only when this module is the program's entry point; the value returned
// by main() becomes the process exit code.
if (import.meta.main) {
  const cliArgs = [...Deno.args];
  const exitCode = main(cliArgs);
  Deno.exit(exitCode);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment