Skip to content

Instantly share code, notes, and snippets.

@mandeepsmagh
Last active June 10, 2026 10:00
Show Gist options
  • Select an option

  • Save mandeepsmagh/c34b022d2cb0abe78f0ddd1f1feeb6b7 to your computer and use it in GitHub Desktop.

Select an option

Save mandeepsmagh/c34b022d2cb0abe78f0ddd1f1feeb6b7 to your computer and use it in GitHub Desktop.
macOS OCR using Vision framework

macOS Local Offline OCR - CLI

A Swift script using Apple's built-in Vision framework for fully offline, private OCR on images and PDFs. Tested on macOS 26 (Tahoe).

Installation

  1. Paste the script into a file:

    nano ~/.local/bin/ocr
  2. Make it executable:

    chmod +x ~/.local/bin/ocr
  3. Ensure ~/.local/bin is in your PATH (add to ~/.zshrc if needed):

    export PATH="$HOME/.local/bin:$PATH"

Script

#!/usr/bin/swift

import Foundation
import Vision
import PDFKit
import AppKit

/// Parsed command-line settings for one run of the tool.
struct Options {
    /// Input paths, in the order they were given.
    var files = [String]()
    /// Explicit recognition languages; `nil` leaves language detection automatic.
    var languages: [Locale.Language]?
    /// When set, the tool prints the supported languages and exits.
    var listLanguages: Bool = false
    /// Raw `--pages` selector, applied to PDFs only.
    var pageSpec: String?
    /// When set, the '=== Page N ===' separators are left out of PDF output.
    var noPageHeader: Bool = false
}

/// Writes `message` plus a trailing newline to stderr, then terminates the
/// process with `code` (1 unless overridden). Never returns.
func die(_ message: String, code: Int32 = 1) -> Never {
    let line = message + "\n"
    fputs(line, stderr)
    exit(code)
}

/// Prints the command-line help text to stdout.
func usage() {
    let helpText = """
    usage: ocr [options] file1 [file2 ...]

    options:
      --help, -h            show help
      --lang LANGS          recognition languages, comma-separated (e.g. en-US,zh-Hant)
      --langs               list supported recognition languages and exit
      --list-languages      same as --langs
      --pages SPEC          PDF page selector; default: all
                            examples: 1-3, 1,3,5, odd, even, first, last, 'all,!1-2'
                            note: quote specs containing ! in your shell, e.g. --pages 'all,!1-2'
      --no-page-header      omit '=== Page N ===' for PDFs

    examples:
      ocr image.png
      ocr doc.pdf
      ocr doc.pdf --pages 1-2
      ocr doc.pdf --pages odd
      ocr doc.pdf --pages 'all,!1-2'
      ocr doc.pdf --lang zh-Hant,en-US
      ocr *.png > output.txt
    """
    print(helpText)
}

// helpers

/// Builds a hyphen-separated tag (language code, script, region) from `lang`,
/// leaving out any component that is `nil`.
func formatLanguage(_ lang: Locale.Language) -> String {
    let subtags: [String?] = [
        lang.languageCode?.identifier,
        lang.script?.identifier,
        lang.region?.identifier,
    ]
    return subtags.compactMap { $0 }.joined(separator: "-")
}

/// Prints every recognition language reported by an `.accurate`-level
/// text-recognition request, one formatted tag per line, sorted as strings.
func listLanguages() {
    var request = RecognizeTextRequest()
    request.recognitionLevel = .accurate

    let tags = request.supportedRecognitionLanguages.map { formatLanguage($0) }
    for tag in tags.sorted() {
        print(tag)
    }
}

/// Turns a comma-separated list of language identifiers into `Locale.Language`
/// values.
///
/// Each entry is trimmed of surrounding whitespace and blank entries are dropped.
/// - Returns: `nil` when `raw` is `nil` or contains no usable entries.
func parseLanguages(_ raw: String?) -> [Locale.Language]? {
    guard let raw else { return nil }

    var languages: [Locale.Language] = []
    for piece in raw.split(separator: ",") {
        let identifier = piece.trimmingCharacters(in: .whitespacesAndNewlines)
        if identifier.isEmpty { continue }
        languages.append(Locale.Language(identifier: identifier))
    }

    if languages.isEmpty { return nil }
    return languages
}

// Page selection

/// Canonical form of a selector token: surrounding whitespace and newlines
/// removed, then lowercased.
func normalize(_ s: String) -> String {
    let trimmed = s.trimmingCharacters(in: .whitespacesAndNewlines)
    return trimmed.lowercased()
}

/// Clamps `range` to the valid 1-based page numbers `1...pageCount` and
/// returns the surviving pages as a set.
///
/// - Returns: An empty set when `pageCount` is not positive or when the range
///   lies entirely outside `1...pageCount`.
func pageSet(_ range: ClosedRange<Int>, max pageCount: Int) -> Set<Int> {
    if pageCount <= 0 { return [] }

    let smaller = Swift.min(range.lowerBound, range.upperBound)
    let larger = Swift.max(range.lowerBound, range.upperBound)
    let firstPage = Swift.max(smaller, 1)
    let lastPage = Swift.min(larger, pageCount)

    guard firstPage <= lastPage else { return [] }
    return Set(firstPage...lastPage)
}

/// Resolves one page-selector token (without any leading `!`) to a set of
/// 1-based page numbers.
///
/// Accepted forms: `all`, `odd`, `even`, `first`, `last`, a single page number,
/// or a range `A-B` whose bounds may each be a number, `first`, `last`, or empty
/// (`-3` means "up to page 3", `3-` means "page 3 onward").
///
/// - Parameters:
///   - token: The raw token; it is trimmed and lowercased before matching.
///   - pageCount: Number of pages in the document.
/// - Returns: The selected pages, clamped to `1...pageCount`. Empty for an
///   unrecognized token or when `pageCount` is not positive.
func tokenPages(_ token: String, pageCount: Int) -> Set<Int> {
    // With no pages nothing can match; this also keeps every `1...pageCount`
    // below well-formed (a ClosedRange traps when its lower bound exceeds its upper).
    guard pageCount > 0 else { return [] }

    let t = normalize(token)
    switch t {
    case "":      return []
    case "all":   return Set(1...pageCount)
    case "odd":   return Set(stride(from: 1, through: pageCount, by: 2))
    case "even":  return Set(stride(from: 2, through: pageCount, by: 2))
    case "first": return [1]
    case "last":  return [pageCount]
    default:      break
    }

    if let n = Int(t), (1...pageCount).contains(n) { return [n] }

    // Keep empty pieces so "-3" and "3-" still split into exactly two parts.
    let parts = t.split(separator: "-", omittingEmptySubsequences: false).map(String.init)
    guard parts.count == 2 else { return [] }

    // Maps one side of a range to a page number; `nil` means it is not parseable.
    func resolveBound(_ s: String, defaultValue: Int) -> Int? {
        let bound = normalize(s)
        switch bound {
        case "":      return defaultValue
        case "first": return 1
        case "last":  return pageCount
        default:      return Int(bound)
        }
    }

    // An omitted upper bound means "to the end". Int.max (clamped by pageSet)
    // is used instead of pageCount so that a start past the last page, e.g.
    // "7-" on a 5-page document, selects nothing rather than wrapping around.
    guard let a = resolveBound(parts[0], defaultValue: 1),
          let b = resolveBound(parts[1], defaultValue: Int.max) else { return [] }

    // Order the bounds before building the range: `a...b` traps when a > b,
    // which previously crashed on reversed specs such as "5-2" or "last-1".
    return pageSet(min(a, b)...max(a, b), max: pageCount)
}

/// Evaluates a comma-separated page selector against a document with
/// `pageCount` pages.
///
/// Tokens are applied left to right: a plain token adds its pages, a token
/// prefixed with `!` removes its pages. When every token is an exclusion the
/// selection starts from all pages; otherwise it starts empty.
///
/// - Returns: All pages when `spec` is `nil` or blank; an empty set when
///   `pageCount` is not positive.
func selectPages(spec: String?, pageCount: Int) -> Set<Int> {
    if pageCount <= 0 { return [] }
    let everyPage = Set(1...pageCount)

    guard let spec, !spec.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else {
        return everyPage
    }

    let terms = spec.split(separator: ",").map { normalize(String($0)) }
    let onlyExclusions = terms.allSatisfy { $0.hasPrefix("!") }

    var selection: Set<Int> = onlyExclusions ? everyPage : []
    for term in terms {
        if term.hasPrefix("!") {
            let removed = tokenPages(String(term.dropFirst()), pageCount: pageCount)
            selection.subtract(removed)
        } else {
            let added = tokenPages(term, pageCount: pageCount)
            selection.formUnion(added)
        }
    }
    return selection
}

// Text sorting

/// Returns the best candidate string of each observation, ordered row by row.
///
/// Observations are ordered by descending `boundingBox.origin.y`, grouped into
/// rows whose y values lie within 0.02 of the row's first observation, and each
/// row is then ordered by ascending `boundingBox.origin.x`. Strings are trimmed
/// of surrounding whitespace and empty results are dropped.
///
/// NOTE(review): "larger y first" is presumably top-of-page first (normalized,
/// lower-left-origin coordinates) — confirm against the Vision documentation.
func sortedText(_ observations: [RecognizedTextObservation]) -> [String] {
    // Sort on y alone. The earlier comparator mixed a tolerance test with an x
    // comparison, which is not a strict weak ordering (a≈b and b≈c does not
    // imply a≈c), so the resulting order was unspecified.
    let topDown = observations.sorted { $0.boundingBox.origin.y > $1.boundingBox.origin.y }

    // Walk the sorted list and start a new row whenever an observation sits
    // more than the tolerance below the first observation of the current row.
    var rows: [[RecognizedTextObservation]] = []
    for observation in topDown {
        let y = observation.boundingBox.origin.y
        if let rowY = rows.last?.first?.boundingBox.origin.y, abs(rowY - y) <= 0.02 {
            rows[rows.count - 1].append(observation)
        } else {
            rows.append([observation])
        }
    }

    return rows
        .flatMap { row in row.sorted { $0.boundingBox.origin.x < $1.boundingBox.origin.x } }
        .compactMap {
            $0.topCandidates(1).first?.string
                .trimmingCharacters(in: .whitespacesAndNewlines)
        }
        .filter { !$0.isEmpty }
}

// Image OCR

/// Runs accurate-level text recognition with language correction on the image
/// at `url`.
///
/// - Parameters:
///   - url: Location of the image to recognize.
///   - languages: Recognition languages to use; when `nil`, automatic language
///     detection is turned on instead.
/// - Returns: Recognized lines as ordered by `sortedText`, or an empty array
///   (after a warning on stderr) when recognition throws.
func ocrImage(_ url: URL, languages: [Locale.Language]?) async -> [String] {
    var request = RecognizeTextRequest()
    request.recognitionLevel = .accurate
    request.usesLanguageCorrection = true

    if let languages {
        request.automaticallyDetectsLanguage = false
        request.recognitionLanguages = languages
    } else {
        request.automaticallyDetectsLanguage = true
    }

    let observations: [RecognizedTextObservation]
    do {
        observations = try await request.perform(on: url)
    } catch {
        fputs("warning: couldn't recognize text in \(url.lastPathComponent): \(error)\n", stderr)
        return []
    }
    return sortedText(observations)
}

// PDF rendering

/// Rasterizes a PDF page onto a white background and writes it as a PNG into
/// the temporary directory.
///
/// - Parameters:
///   - page: The page to render; its media box determines the bitmap size.
///   - scale: Multiplier applied to the media-box dimensions (default 3.0).
/// - Returns: URL of the written PNG, or `nil` if the bitmap context, image,
///   PNG encoding or file write fails. The caller is responsible for deleting
///   the file.
func renderPDFPage(_ page: PDFPage, scale: CGFloat = 3.0) -> URL? {
    let bounds = page.bounds(for: .mediaBox)
    // Never ask for a zero-sized bitmap.
    let width  = max(Int(bounds.width  * scale), 1)
    let height = max(Int(bounds.height * scale), 1)

    guard let colorSpace = CGColorSpace(name: CGColorSpace.sRGB),
          let ctx = CGContext(
              data: nil, width: width, height: height,
              bitsPerComponent: 8, bytesPerRow: 0, space: colorSpace,
              bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue
          ) else { return nil }

    // Paint the whole bitmap white before the page is drawn over it.
    ctx.setFillColor(NSColor.white.cgColor)
    ctx.fill(CGRect(x: 0, y: 0, width: width, height: height))

    // NOTE(review): the context is flipped vertically before drawing. Confirm
    // that rendered pages come out upright — a plain bitmap CGContext already
    // uses a bottom-left origin, so this flip may be unnecessary.
    ctx.saveGState()
    ctx.translateBy(x: 0, y: CGFloat(height))
    ctx.scaleBy(x: scale, y: -scale)
    page.draw(with: .mediaBox, to: ctx)
    ctx.restoreGState()

    // Encode straight from the CGImage; going through NSImage and a TIFF
    // round-trip only added an extra full-size encode/decode.
    guard let cgImage = ctx.makeImage(),
          let png = NSBitmapImageRep(cgImage: cgImage)
              .representation(using: .png, properties: [:]) else { return nil }

    let tmp = FileManager.default.temporaryDirectory
        .appendingPathComponent(UUID().uuidString)
        .appendingPathExtension("png")

    do {
        try png.write(to: tmp)
        return tmp
    } catch {
        fputs("warning: couldn't write temp image: \(error)\n", stderr)
        return nil
    }
}

// PDF OCR (sequential — page order must be preserved)

/// Extracts text from the selected pages of a PDF, in ascending page order.
///
/// For each selected page the embedded text is used when it is non-empty;
/// otherwise the page is rendered to a temporary image, recognized with
/// `ocrImage`, and the temporary file is removed.
///
/// - Parameters:
///   - url: Location of the PDF.
///   - languages: Recognition languages passed through to `ocrImage`.
///   - pageSpec: Page selector passed to `selectPages`; `nil` selects all pages.
///   - noPageHeader: When `true`, the '=== Page N ===' line is not emitted.
/// - Returns: Output lines for all pages that produced text; empty (after a
///   warning on stderr) when the PDF cannot be opened.
func ocrPDF(
    _ url: URL,
    languages: [Locale.Language]?,
    pageSpec: String?,
    noPageHeader: Bool
) async -> [String] {
    guard let doc = PDFDocument(url: url) else {
        fputs("warning: couldn't open PDF \(url.lastPathComponent)\n", stderr)
        return []
    }

    let selected = selectPages(spec: pageSpec, pageCount: doc.pageCount)
    var out: [String] = []

    // Walk the selection itself rather than `1...doc.pageCount`: that range
    // traps when a document reports zero pages, whereas the selection is then
    // simply empty. Sorting keeps the output in page order.
    for n in selected.sorted() {
        guard let page = doc.page(at: n - 1) else { continue }

        // Prefer the page's embedded text when it has any.
        let embedded = (page.string ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
        if !embedded.isEmpty {
            if !noPageHeader { out.append("=== Page \(n) ===") }
            out.append(embedded)
            continue
        }

        guard let tmp = renderPDFPage(page) else {
            fputs("warning: couldn't render page \(n) of \(url.lastPathComponent)\n", stderr)
            continue
        }

        let lines = await ocrImage(tmp, languages: languages)
        // Best-effort cleanup of the temporary render.
        try? FileManager.default.removeItem(at: tmp)

        // Pages with no recognized text contribute nothing, not even a header.
        if !lines.isEmpty {
            if !noPageHeader { out.append("=== Page \(n) ===") }
            out.append(contentsOf: lines)
        }
    }

    return out
}

// File dispatch
// Parallel across files, sequential within each PDF.
// --pages is ignored (with a warning) for non-PDF files.

/// Recognizes the text of one input file, dispatching on its extension:
/// `.pdf` (case-insensitive) goes to `ocrPDF`, anything else to `ocrImage`.
///
/// - Parameters:
///   - file: Path as given on the command line; a leading `~` is expanded.
///   - languages: Recognition languages, or `nil` for automatic detection.
///   - pageSpec: Page selector; only used for PDFs (a warning is printed when
///     it is supplied for a non-PDF file).
///   - noPageHeader: Forwarded to `ocrPDF`.
/// - Returns: The recognized lines joined with newlines, or `nil` when the file
///   does not exist or no text was produced.
func processFile(
    _ file: String,
    languages: [Locale.Language]?,
    pageSpec: String?,
    noPageHeader: Bool
) async -> String? {
    let expandedPath = (file as NSString).expandingTildeInPath
    if !FileManager.default.fileExists(atPath: expandedPath) {
        fputs("warning: file not found: \(file)\n", stderr)
        return nil
    }

    let url = URL(fileURLWithPath: expandedPath)
    let lines: [String]

    if url.pathExtension.lowercased() == "pdf" {
        lines = await ocrPDF(url, languages: languages, pageSpec: pageSpec, noPageHeader: noPageHeader)
    } else {
        if pageSpec != nil {
            fputs("warning: --pages ignored for image file: \(url.lastPathComponent)\n", stderr)
        }
        lines = await ocrImage(url, languages: languages)
    }

    guard !lines.isEmpty else { return nil }
    return lines.joined(separator: "\n")
}

// Arg parsing

/// Parses command-line arguments (without the program name) into `Options`.
///
/// `--help`/`-h` prints the usage text and exits with status 0. A missing value
/// for `--lang` or `--pages`, or an unrecognized option, terminates the process
/// through `die`. A bare `--` ends option parsing: everything after it is taken
/// as a file name, which allows paths that begin with `-`.
///
/// - Parameter args: The arguments to parse.
/// - Returns: The populated options; non-option arguments are collected in
///   `files` in their original order.
func parseOptions(_ args: [String]) -> Options {
    var options = Options()
    var i = 0
    while i < args.count {
        let arg = args[i]
        switch arg {
        case "--help", "-h":
            usage(); exit(0)
        case "--langs", "--list-languages":
            options.listLanguages = true; i += 1
        case "--lang":
            guard i + 1 < args.count else { die("--lang requires a value") }
            options.languages = parseLanguages(args[i + 1]); i += 2
        case "--pages":
            guard i + 1 < args.count else { die("--pages requires a value") }
            options.pageSpec = args[i + 1]; i += 2
        case "--no-page-header":
            options.noPageHeader = true; i += 1
        case "--":
            // End-of-options marker: the remainder is file names only.
            options.files.append(contentsOf: args[(i + 1)...])
            i = args.count
        default:
            // A mistyped option (e.g. "--page") used to be queued as a file and
            // only surfaced later as "file not found". Reject it up front.
            // A lone "-" is still passed through as a file name.
            if arg.hasPrefix("-") && arg.count > 1 {
                die("unknown option: \(arg) (use --help for usage, or -- before file names starting with '-')")
            }
            options.files.append(arg); i += 1
        }
    }
    return options
}

// Entry point

let options = parseOptions(Array(CommandLine.arguments.dropFirst()))

// Listing languages takes precedence over processing any files.
if options.listLanguages {
    listLanguages()
    exit(0)
}

// Nothing to do without at least one input file.
guard !options.files.isEmpty else {
    usage()
    exit(1)
}

Task {
    // Copy the settings into locals so the child tasks capture plain values.
    let languages    = options.languages
    let pageSpec     = options.pageSpec
    let noPageHeader = options.noPageHeader

    // Files run concurrently; each result is keyed by the file's position on
    // the command line so the output can be put back into input order.
    let texts: [String] = await withTaskGroup(
        of: (position: Int, text: String?).self,
        returning: [String].self
    ) { group in
        for (position, file) in options.files.enumerated() {
            group.addTask {
                let text = await processFile(
                    file,
                    languages: languages,
                    pageSpec: pageSpec,
                    noPageHeader: noPageHeader
                )
                return (position: position, text: text)
            }
        }

        // Files that produced no text (nil) are left out entirely.
        var textByPosition: [Int: String] = [:]
        for await finished in group {
            guard let text = finished.text else { continue }
            textByPosition[finished.position] = text
        }
        return textByPosition.keys.sorted().compactMap { textByPosition[$0] }
    }

    if !texts.isEmpty {
        print(texts.joined(separator: "\n\n"))
    }

    exit(0)
}

// Keep the process alive until the task above calls exit.
RunLoop.main.run()

Usage

# Basic image OCR
ocr image.png

# PDF — all pages
ocr doc.pdf

# PDF — page ranges
ocr doc.pdf --pages 1-3
ocr doc.pdf --pages 1,3,5
ocr doc.pdf --pages odd
ocr doc.pdf --pages even
ocr doc.pdf --pages first
ocr doc.pdf --pages last

# PDF — exclude pages (quote the ! to prevent shell history expansion)
ocr doc.pdf --pages '!1-2'

# Specific language(s)
ocr doc.pdf --lang zh-Hant,en-US

# Suppress '=== Page N ===' headers
ocr doc.pdf --no-page-header

# Batch images to file
ocr *.png > output.txt

# List all supported recognition languages
ocr --langs

Shell note: Any --pages spec containing ! must be single-quoted in bash/zsh to prevent history expansion, e.g. write --pages 'all,!1-2' rather than --pages all,!1-2.

How It Works

  • Uses Apple Vision RecognizeTextRequest at .accurate level
  • Auto-detects language unless --lang is specified
  • PDFs: for each page, uses the embedded text if present; otherwise renders that page to PNG at 3× scale and OCRs the image
  • Multiple files processed in parallel; pages within each PDF processed sequentially to preserve order
  • --pages spec supports ranges, keywords, comma lists, and ! exclusions
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment