A Swift script using Apple's built-in Vision framework for fully offline, private OCR on images and PDFs. Tested on macOS 26 (Tahoe).
- Paste the script into a file:
  nano ~/.local/bin/ocr
- Make it executable:
  chmod +x ~/.local/bin/ocr
- Ensure `~/.local/bin` is in your PATH (add to `~/.zshrc` if needed):
  export PATH="$HOME/.local/bin:$PATH"
#!/usr/bin/swift
import Foundation
import Vision
import PDFKit
import AppKit
/// Settings gathered from the command line by `parseOptions`.
struct Options {
    /// Input paths, in the order they appeared on the command line.
    var files: [String] = []
    /// Languages given with `--lang`; `nil` when the flag was not supplied.
    var languages: [Locale.Language]?
    /// `true` when `--langs` / `--list-languages` was given.
    var listLanguages: Bool = false
    /// Raw `--pages` selector; `nil` when the flag was not supplied.
    var pageSpec: String?
    /// `true` when `--no-page-header` was given.
    var noPageHeader: Bool = false
}
/// Prints `message` followed by a newline to standard error, then terminates
/// the process.
///
/// - Parameters:
///   - message: Text to report.
///   - code: Process exit status (defaults to 1).
func die(_ message: String, code: Int32 = 1) -> Never {
    let line = message + "\n"
    fputs(line, stderr)
    exit(code)
}
/// Prints the command-line help text to standard output.
func usage() {
    let helpText = """
    usage: ocr [options] file1 [file2 ...]
    options:
    --help, -h show help
    --lang LANGS recognition languages, comma-separated (e.g. en-US,zh-Hant)
    --langs list supported recognition languages and exit
    --list-languages same as --langs
    --pages SPEC PDF page selector; default: all
    examples: 1-3, 1,3,5, odd, even, first, last, 'all,!1-2'
    note: quote specs containing ! in your shell, e.g. --pages 'all,!1-2'
    --no-page-header omit '=== Page N ===' for PDFs
    examples:
    ocr image.png
    ocr doc.pdf
    ocr doc.pdf --pages 1-2
    ocr doc.pdf --pages odd
    ocr doc.pdf --pages 'all,!1-2'
    ocr doc.pdf --lang zh-Hant,en-US
    ocr *.png > output.txt
    """
    print(helpText)
}
// helpers
/// Builds a hyphen-separated tag (language, script, region) from whichever of
/// those components `lang` reports; missing components are skipped.
func formatLanguage(_ lang: Locale.Language) -> String {
    let subtags: [String?] = [
        lang.languageCode?.identifier,
        lang.script?.identifier,
        lang.region?.identifier,
    ]
    return subtags.compactMap { $0 }.joined(separator: "-")
}
/// Prints, one per line and sorted alphabetically, the languages that an
/// `.accurate`-level `RecognizeTextRequest` reports as supported.
func listLanguages() {
    var request = RecognizeTextRequest()
    request.recognitionLevel = .accurate

    let names = request.supportedRecognitionLanguages
        .map { formatLanguage($0) }
        .sorted()
    for name in names {
        print(name)
    }
}
/// Parses a comma-separated list of language identifiers.
///
/// - Parameter raw: The value given to `--lang`, or `nil`.
/// - Returns: One `Locale.Language` per non-blank entry, or `nil` when `raw`
///   is `nil` or contains no non-blank entries.
func parseLanguages(_ raw: String?) -> [Locale.Language]? {
    guard let raw else { return nil }

    var languages: [Locale.Language] = []
    for piece in raw.split(separator: ",") {
        let identifier = piece.trimmingCharacters(in: .whitespacesAndNewlines)
        if identifier.isEmpty { continue }
        languages.append(Locale.Language(identifier: identifier))
    }
    return languages.isEmpty ? nil : languages
}
// Page selection
/// Strips surrounding whitespace/newlines from `s` and lowercases the result.
func normalize(_ s: String) -> String {
    let trimmed = s.trimmingCharacters(in: .whitespacesAndNewlines)
    return trimmed.lowercased()
}
/// Returns the page numbers of `range` that fall inside `1...pageCount`.
///
/// - Parameters:
///   - range: Requested 1-based page range.
///   - pageCount: Number of pages in the document.
/// - Returns: The clamped pages, or an empty set when `pageCount` is not
///   positive or the range lies entirely outside the document.
func pageSet(_ range: ClosedRange<Int>, max pageCount: Int) -> Set<Int> {
    if pageCount <= 0 { return [] }

    // A ClosedRange always has lowerBound <= upperBound, so clamping each end
    // is all that is needed.
    let first = Swift.max(1, range.lowerBound)
    let last = Swift.min(pageCount, range.upperBound)

    guard first <= last else { return [] }
    return Set(first...last)
}
/// Resolves a single page-selector token to a set of 1-based page numbers.
///
/// Accepted tokens (case-insensitive, surrounding whitespace ignored):
/// `all`, `odd`, `even`, `first`, `last`, a single page number, or a range
/// `A-B` where each bound is a number, `first`, `last`, or empty (an empty
/// lower bound means page 1, an empty upper bound means the last page).
/// Reversed ranges such as `3-1` are treated like `1-3`.
///
/// - Parameters:
///   - token: The selector token, without any leading `!`.
///   - pageCount: Number of pages in the document.
/// - Returns: The matching pages, clamped to `1...pageCount`; empty for an
///   unrecognized token or when `pageCount` is not positive.
func tokenPages(_ token: String, pageCount: Int) -> Set<Int> {
    // `1...pageCount` traps for pageCount < 1, so bail out before forming it.
    guard pageCount > 0 else { return [] }
    let allPages = 1...pageCount

    let t = normalize(token)
    switch t {
    case "": return []
    case "all": return Set(allPages)
    case "odd": return Set(allPages.filter { !$0.isMultiple(of: 2) })
    case "even": return Set(allPages.filter { $0.isMultiple(of: 2) })
    case "first": return [1]
    case "last": return [pageCount]
    default: break
    }

    if let n = Int(t), allPages.contains(n) { return [n] }

    // Keep empty pieces so open-ended ranges ("-3", "2-") still yield two parts.
    let parts = t.split(separator: "-", omittingEmptySubsequences: false).map(String.init)
    guard parts.count == 2 else { return [] }

    /// Converts one side of a range to a page number, or `nil` if unparseable.
    func resolveBound(_ s: String, defaultValue: Int) -> Int? {
        let bound = normalize(s)
        switch bound {
        case "": return defaultValue
        case "first": return 1
        case "last": return pageCount
        default: return Int(bound)
        }
    }

    guard let a = resolveBound(parts[0], defaultValue: 1),
          let b = resolveBound(parts[1], defaultValue: pageCount) else { return [] }

    // Order the bounds first: `a...b` traps at runtime when a > b.
    return pageSet(min(a, b)...max(a, b), max: pageCount)
}
/// Evaluates a comma-separated page selector against a document.
///
/// Tokens are applied left to right: a plain token adds its pages, a token
/// prefixed with `!` removes its pages. When every token is an exclusion the
/// starting set is all pages; otherwise it is empty.
///
/// - Parameters:
///   - spec: The `--pages` value; `nil` or blank selects every page.
///   - pageCount: Number of pages in the document.
/// - Returns: The selected 1-based page numbers (empty if `pageCount` is not
///   positive).
func selectPages(spec: String?, pageCount: Int) -> Set<Int> {
    if pageCount <= 0 { return [] }
    let everyPage = Set(1...pageCount)

    guard let spec, !spec.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
        return everyPage
    }

    let tokens = spec.split(separator: ",").map { normalize(String($0)) }
    let onlyExclusions = tokens.allSatisfy { $0.hasPrefix("!") }

    var selected: Set<Int> = onlyExclusions ? everyPage : []
    for token in tokens {
        if token.hasPrefix("!") {
            let removed = tokenPages(String(token.dropFirst()), pageCount: pageCount)
            selected.subtract(removed)
        } else {
            let added = tokenPages(token, pageCount: pageCount)
            selected.formUnion(added)
        }
    }
    return selected
}
// Text sorting
/// Returns the recognized strings in reading order: rows from larger to
/// smaller `y`, and left to right (ascending `x`) within a row.
///
/// Observations are first ordered strictly by the `y` of their bounding-box
/// origin, then grouped into rows: an observation joins the current row when
/// its `y` is within 0.02 of the row's first observation. A single comparator
/// mixing a tolerance test with an `x` tie-break is not a strict weak ordering
/// (a≈b and b≈c do not imply a≈c), which `sorted(by:)` requires, so the
/// grouping is done explicitly instead.
///
/// - Parameter observations: Text observations from one image.
/// - Returns: The top candidate string of each observation, trimmed, with
///   empty strings removed.
func sortedText(_ observations: [RecognizedTextObservation]) -> [String] {
    // Pass 1: strict ordering by y, largest first.
    let topToBottom = observations.sorted {
        $0.boundingBox.origin.y > $1.boundingBox.origin.y
    }

    // Pass 2: split into rows, anchored on the first observation of each row.
    var rows: [[RecognizedTextObservation]] = []
    for observation in topToBottom {
        let y = observation.boundingBox.origin.y
        if let anchor = rows.last?.first, abs(anchor.boundingBox.origin.y - y) <= 0.02 {
            rows[rows.count - 1].append(observation)
        } else {
            rows.append([observation])
        }
    }

    // Pass 3: order each row left to right and extract the text.
    return rows
        .flatMap { row in
            row.sorted { $0.boundingBox.origin.x < $1.boundingBox.origin.x }
        }
        .compactMap {
            $0.topCandidates(1).first?.string
                .trimmingCharacters(in: .whitespacesAndNewlines)
        }
        .filter { !$0.isEmpty }
}
// Image OCR
/// Runs text recognition on the image at `url`.
///
/// - Parameters:
///   - url: Location of the image file.
///   - languages: Recognition languages; when `nil`, automatic language
///     detection is enabled instead.
/// - Returns: Recognized lines as ordered by `sortedText`, or an empty array
///   (after a warning on standard error) when recognition throws.
func ocrImage(_ url: URL, languages: [Locale.Language]?) async -> [String] {
    var request = RecognizeTextRequest()
    request.recognitionLevel = .accurate
    request.usesLanguageCorrection = true
    if let languages {
        request.automaticallyDetectsLanguage = false
        request.recognitionLanguages = languages
    } else {
        request.automaticallyDetectsLanguage = true
    }

    let observations: [RecognizedTextObservation]
    do {
        observations = try await request.perform(on: url)
    } catch {
        fputs("warning: couldn't recognize text in \(url.lastPathComponent): \(error)\n", stderr)
        return []
    }
    return sortedText(observations)
}
// PDF rendering
/// Rasterizes a PDF page onto a white background and writes it to a
/// uniquely named PNG in the temporary directory.
///
/// - Parameters:
///   - page: The page to render (its media box is used).
///   - scale: Pixels per PDF point; defaults to 3.
/// - Returns: URL of the PNG, or `nil` if the bitmap context, the image, the
///   PNG encoding, or the file write fails. The caller is responsible for
///   deleting the file.
func renderPDFPage(_ page: PDFPage, scale: CGFloat = 3.0) -> URL? {
    let bounds = page.bounds(for: .mediaBox)
    let width = max(Int(bounds.width * scale), 1)
    let height = max(Int(bounds.height * scale), 1)

    guard let colorSpace = CGColorSpace(name: CGColorSpace.sRGB),
          let ctx = CGContext(
              data: nil, width: width, height: height,
              bitsPerComponent: 8, bytesPerRow: 0, space: colorSpace,
              bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue
          ) else { return nil }

    // Paint the background white so transparent pages don't render as black.
    ctx.setFillColor(NSColor.white.cgColor)
    ctx.fill(CGRect(x: 0, y: 0, width: width, height: height))

    // NOTE(review): this flips the y axis before drawing. Bitmap CGContexts
    // and PDF page space both appear to use a lower-left origin on macOS, so
    // confirm on an image-only PDF that the output is not vertically mirrored.
    ctx.saveGState()
    ctx.translateBy(x: 0, y: CGFloat(height))
    ctx.scaleBy(x: scale, y: -scale)
    page.draw(with: .mediaBox, to: ctx)
    ctx.restoreGState()

    // Encode straight from the CGImage; no intermediate NSImage/TIFF copy of
    // the full-page bitmap is needed.
    guard let cgImage = ctx.makeImage(),
          let png = NSBitmapImageRep(cgImage: cgImage)
              .representation(using: .png, properties: [:]) else { return nil }

    let tmp = FileManager.default.temporaryDirectory
        .appendingPathComponent(UUID().uuidString)
        .appendingPathExtension("png")
    do {
        // Atomic write: a failed write leaves no partial file behind.
        try png.write(to: tmp, options: .atomic)
        return tmp
    } catch {
        fputs("warning: couldn't write temp image: \(error)\n", stderr)
        return nil
    }
}
// PDF OCR (sequential — page order must be preserved)
/// Extracts text from the selected pages of a PDF, in ascending page order.
///
/// For each selected page the embedded text is used when it is non-empty;
/// otherwise the page is rendered with `renderPDFPage` and passed to
/// `ocrImage`, and the temporary image is removed afterwards.
///
/// - Parameters:
///   - url: Location of the PDF.
///   - languages: Recognition languages forwarded to `ocrImage`.
///   - pageSpec: Page selector forwarded to `selectPages`.
///   - noPageHeader: When `true`, the `=== Page N ===` line is omitted.
/// - Returns: Output lines; empty if the PDF can't be opened, has no pages,
///   or yields no text.
func ocrPDF(
    _ url: URL,
    languages: [Locale.Language]?,
    pageSpec: String?,
    noPageHeader: Bool
) async -> [String] {
    guard let doc = PDFDocument(url: url) else {
        fputs("warning: couldn't open PDF \(url.lastPathComponent)\n", stderr)
        return []
    }

    // A zero-page document would make `1...pageCount` trap at runtime.
    let pageCount = doc.pageCount
    guard pageCount > 0 else {
        fputs("warning: PDF has no pages: \(url.lastPathComponent)\n", stderr)
        return []
    }

    let selected = selectPages(spec: pageSpec, pageCount: pageCount)
    var out: [String] = []

    // Pages are handled one after another so output stays in page order.
    for n in selected.sorted() {
        guard let page = doc.page(at: n - 1) else { continue }

        // Prefer the PDF's own text layer when the page has one.
        let embedded = (page.string ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
        if !embedded.isEmpty {
            if !noPageHeader { out.append("=== Page \(n) ===") }
            out.append(embedded)
            continue
        }

        guard let tmp = renderPDFPage(page) else {
            fputs("warning: couldn't render page \(n) of \(url.lastPathComponent)\n", stderr)
            continue
        }
        let lines = await ocrImage(tmp, languages: languages)
        // Best-effort cleanup of the rendered page image.
        try? FileManager.default.removeItem(at: tmp)

        if !lines.isEmpty {
            if !noPageHeader { out.append("=== Page \(n) ===") }
            out.append(contentsOf: lines)
        }
    }
    return out
}
// File dispatch
// Parallel across files, sequential within each PDF.
// --pages is ignored (with a warning) for non-PDF files.
/// Extracts text from one input file, choosing the PDF or image path by the
/// file extension.
///
/// - Parameters:
///   - file: Path as given on the command line (a leading `~` is expanded).
///   - languages: Recognition languages, or `nil`.
///   - pageSpec: Page selector; applies to PDFs only, a warning is printed
///     when it is given for any other file.
///   - noPageHeader: Forwarded to `ocrPDF`.
/// - Returns: The extracted lines joined with newlines, or `nil` when the
///   file does not exist or produced no text.
func processFile(
    _ file: String,
    languages: [Locale.Language]?,
    pageSpec: String?,
    noPageHeader: Bool
) async -> String? {
    let expandedPath = (file as NSString).expandingTildeInPath
    if !FileManager.default.fileExists(atPath: expandedPath) {
        fputs("warning: file not found: \(file)\n", stderr)
        return nil
    }

    let url = URL(fileURLWithPath: expandedPath)
    let lines: [String]
    if url.pathExtension.lowercased() == "pdf" {
        lines = await ocrPDF(url, languages: languages, pageSpec: pageSpec, noPageHeader: noPageHeader)
    } else {
        if pageSpec != nil {
            fputs("warning: --pages ignored for image file: \(url.lastPathComponent)\n", stderr)
        }
        lines = await ocrImage(url, languages: languages)
    }

    guard !lines.isEmpty else { return nil }
    return lines.joined(separator: "\n")
}
// Arg parsing
/// Parses command-line arguments into `Options`.
///
/// Recognized flags: `--help`/`-h` (prints usage and exits 0), `--langs` /
/// `--list-languages`, `--lang VALUE`, `--pages VALUE`, `--no-page-header`.
/// A bare `--` ends option parsing so that later arguments are always taken
/// as file paths, even if they begin with `-`. Any other argument beginning
/// with `-` (except a lone `-`) is reported as an unknown option instead of
/// being silently treated as a file path.
///
/// - Parameter args: Arguments without the program name.
/// - Returns: The populated options. Exits via `die` on a missing flag value
///   or an unknown option.
func parseOptions(_ args: [String]) -> Options {
    var options = Options()
    var optionsEnded = false
    var i = 0

    while i < args.count {
        let arg = args[i]

        if optionsEnded {
            options.files.append(arg); i += 1
            continue
        }

        switch arg {
        case "--":
            optionsEnded = true; i += 1
        case "--help", "-h":
            usage(); exit(0)
        case "--langs", "--list-languages":
            options.listLanguages = true; i += 1
        case "--lang":
            guard i + 1 < args.count else { die("--lang requires a value") }
            options.languages = parseLanguages(args[i + 1]); i += 2
        case "--pages":
            guard i + 1 < args.count else { die("--pages requires a value") }
            options.pageSpec = args[i + 1]; i += 2
        case "--no-page-header":
            options.noPageHeader = true; i += 1
        case _ where arg.hasPrefix("-") && arg != "-":
            // A typo such as `--page` must not be mistaken for a file name.
            die("unknown option: \(arg) (use --help for usage)")
        default:
            options.files.append(arg); i += 1
        }
    }
    return options
}
// Entry point
// Entry point: parse arguments, handle the list-languages and no-input cases,
// then process every file concurrently and print the results in input order.
let options = parseOptions(Array(CommandLine.arguments.dropFirst()))

if options.listLanguages {
    listLanguages()
    exit(0)
}

if options.files.isEmpty {
    usage()
    exit(1)
}

Task {
    // Copy the settings into locals so the child tasks capture plain values.
    let languages = options.languages
    let pageSpec = options.pageSpec
    let noPageHeader = options.noPageHeader

    let outputs: [String] = await withTaskGroup(
        of: (index: Int, result: String?).self,
        returning: [String].self
    ) { group in
        for (position, file) in options.files.enumerated() {
            group.addTask {
                let text = await processFile(
                    file,
                    languages: languages,
                    pageSpec: pageSpec,
                    noPageHeader: noPageHeader
                )
                return (index: position, result: text)
            }
        }

        // Results arrive in completion order; store each at its input
        // position so the final output follows the command-line order.
        var slots = [String?](repeating: nil, count: options.files.count)
        for await item in group {
            slots[item.index] = item.result
        }
        return slots.compactMap { $0 }
    }

    if !outputs.isEmpty {
        print(outputs.joined(separator: "\n\n"))
    }
    exit(0)
}
RunLoop.main.run()
# Basic image OCR
ocr image.png
# PDF — all pages
ocr doc.pdf
# PDF — page ranges
ocr doc.pdf --pages 1-3
ocr doc.pdf --pages 1,3,5
ocr doc.pdf --pages odd
ocr doc.pdf --pages even
ocr doc.pdf --pages first
ocr doc.pdf --pages last
# PDF — exclude pages (quote the ! to prevent shell history expansion)
ocr doc.pdf --pages '!1-2'
# Specific language(s)
ocr doc.pdf --lang zh-Hant,en-US
# Suppress '=== Page N ===' headers
ocr doc.pdf --no-page-header
# Batch images to file
ocr *.png > output.txt
# List all supported recognition languages
ocr --langs
Shell note: Any `--pages` spec containing `!` must be single-quoted in bash/zsh to prevent history expansion, e.g. `--pages '!1-2'`, not `--pages all,!1-2`.
- Uses Apple Vision `RecognizeTextRequest` at `.accurate` level
- Auto-detects language unless `--lang` is specified
- PDFs: uses embedded text if present; otherwise renders each page to PNG at 3× scale and OCRs the image
- Multiple files processed in parallel; pages within each PDF processed sequentially to preserve order
- `--pages` spec supports ranges, keywords, comma lists, and `!` exclusions