Skip to content

Instantly share code, notes, and snippets.

@5HT2
Created May 22, 2025 20:22
Show Gist options
  • Save 5HT2/58c466929ee767cff7edc0f333194c77 to your computer and use it in GitHub Desktop.
Save 5HT2/58c466929ee767cff7edc0f333194c77 to your computer and use it in GitHub Desktop.
dump-pdfs
# Run dump-pdfs (with -rename) on every PDF that fd reports as changed within
# a given time window.
#
# Usage:   dump-pdfs-recent <time> [fd args]...
#   <time>     value passed to fd's --changed-within (e.g. 3m, 1h)
#   [fd args]  any remaining arguments are forwarded to fd unchanged
# Outputs: usage / fatal messages on stderr
# Returns: the status of `whence -p fd` when fd is not on PATH,
#          64 when <time> is missing,
#          127 when the probe search fails (no matching PDFs),
#          otherwise the status of the `fd ... -X dump-pdfs -rename` run.
dump-pdfs-recent () {
  # Checked on its own line: combined with `local x="$(...)"` the status
  # tested by `||` would be that of `local`, not of the lookup.
  whence -p fd >/dev/null || return $?

  local arg=(-j 16 -u -e pdf --changed-within)
  local arq=(-q -1)
  local acw="$1"

  if (( ! ${#acw} )); then
    print >&2 "Usage: dump-pdfs-recent [time (3m | 1h)] [fd args]..."
    return 64
  fi

  # Remove --changed-within timestamp from rest of args for fd
  shift

  # Check for files first (quiet search that stops at the first hit)
  fd "${arg[@]}" "$acw" "${arq[@]}" "$@" || {
    print >&2 "fatal: Didn't find any valid PDF files to process!"
    return 127
  }

  # Exec dump-pdfs once with all matches
  fd "${arg[@]}" "$acw" "$@" -X dump-pdfs -rename
}
#!/usr/bin/env zsh
# pipefail is enabled with setopt rather than on the shebang line: kernels that
# hand everything after the interpreter to env as a single argument would look
# for a program literally named "zsh -opipefail", and shebang options are
# ignored altogether when the script is started as `zsh <script>`.
setopt pipefail
# Drop every DUMP_PDFS_* setting so that nothing from an earlier run lingers.
# Takes no arguments; unsets the six DUMP_PDFS_* globals named below.
clear_args() {
  unset DUMP_PDFS_ARG DUMP_PDFS_PDF DUMP_PDFS_OCR DUMP_PDFS_DEBUG DUMP_PDFS_PAPER DUMP_PDFS_RENAME
}
# --- Per-run state ----------------------------------------------------------
# pdf    name of the PDF currently being handled (set by the for-loops below)
# files  queue of files to process
# tsArg  arguments handed to `ts` when the run's output is piped through it
typeset pdf
typeset -a tsArg files
tsArg=(-s "%.T")
files=()

# Start from a clean slate, then record the raw arguments and reset every flag
clear_args
DUMP_PDFS_ARG=("$@")
DUMP_PDFS_PDF=()
DUMP_PDFS_OCR=0 DUMP_PDFS_DEBUG=0 DUMP_PDFS_PAPER=0 DUMP_PDFS_RENAME=0
# Print a log line of the form "[code] # message".
#
# Usage: log [message]
#        log [exit code] [message] [print args]
#          log 0 "this is a debug log"
#          log 2 "this is a usage err"
#          log "this is a normal log"
#
# With a single (or empty second) argument the message is always printed to
# stdout, tagged [0]. With a non-empty second argument, $1 is the code, $2 the
# message, and anything after that is passed to `print` ahead of the message:
#   code > 0    message goes to stderr and the code is returned
#   code <= 0   message is printed only when DUMP_PDFS_DEBUG is non-zero;
#               returns 0
log() {
  local sig=0
  local msg="[0] # $1"

  # If an exit code is given explicitly
  if (( ${#2} )); then
    sig="$1"
    msg="[$sig] # $2"
    shift 2

    # Print to stderr & return exit code
    if (( sig > 0 )); then
      print >&2 $@ "$msg"
      return $sig
    fi

    # Got an exit code <= 0, print debug logs
    if (( DUMP_PDFS_DEBUG )); then
      print $@ "$msg"
    fi
    return 0
  fi

  # Plain log line: drop the message from $@ so only print args remain
  shift
  print $@ "$msg"
}
# Ensure we have necessary tools.
# Looks up pdftotext and ocrmypdf with `command -v`; for the first one that is
# missing, logs an error through `log 1` and returns its status (1).
# Returns 0 when both are found.
parse_pkgs () {
  if ! command -v pdftotext >/dev/null 2>&1; then
    log 1 "pdftotext not found. Please install poppler-utils." || return $?
  fi

  if ! command -v ocrmypdf >/dev/null 2>&1; then
    log 1 "ocrmypdf not found. Please install ocrmypdf." || return $?
  fi
}
# Parse provided user args.
# Reads DUMP_PDFS_ARG: the flags -paper, -rename, -ocr and -debug set the
# matching DUMP_PDFS_* variable to 1; every other argument is appended to the
# global `files` queue. When no file was given, the queue defaults to *.pdf in
# the current directory. With -debug the resulting queue is logged.
parse_args () {
  local arg
  for arg in "${DUMP_PDFS_ARG[@]}"; do
    case "$arg" in
      "-paper")
        DUMP_PDFS_PAPER=1
        ;;
      "-rename")
        DUMP_PDFS_RENAME=1
        ;;
      "-ocr")
        DUMP_PDFS_OCR=1
        ;;
      "-debug")
        DUMP_PDFS_DEBUG=1
        ;;
      *)
        files+=("$arg")
        ;;
    esac
  done

  # Parse file args
  if (( ! ${#files} )); then
    log "No files specified, defaulting to all"
    files+=(*.pdf)
    log "Proceesing ${#files} files"
  fi

  # Debug dump of the queue. Keyed on DUMP_PDFS_DEBUG, the flag -debug sets.
  if (( DUMP_PDFS_DEBUG )); then
    local listing
    # parr is expected to come from ~/.zshcmds; without it, fall back to a
    # plain space-joined listing instead of logging a "command not found".
    [[ -r ~/.zshcmds ]] && source ~/.zshcmds
    if command -v parr >/dev/null 2>&1; then
      listing="$(parr files 2>&1)"
    else
      listing="${files[*]}"
    fi
    log 0 "=== Processed args ==="
    log 0 "Files [${#files}]: $listing"
    log 0 "=== Processed args ==="
  fi
}
# Fix filenames (batched, not individual)
#
# Cleanup patterns (the second is the one in use), kept for reference:
#   ^([^ ]*)([A-z_\. \-\(\)]*)( ?(_|-) Microsoft.+)(\.pdf)$
#   ^([^\ ]*) ?(\(?[A-z_\.\-]*\)?)(([_\-\ ]*)(PowerShell[^\ ]*)?(.+)[_\-\ ]*Microsoft.+)(\.pdf)$
#
# Does nothing unless DUMP_PDFS_RENAME is set. Otherwise walks the global
# `files` queue and rebuilds it:
#   - entries without a .pdf extension, or that are not regular files, are
#     reported and dropped from the queue
#   - entries whose path does not match the "Microsoft Learn" pattern are kept
#     under their current name
#   - matching entries are renamed via gsed when the result is shorter than
#     the original; the new name replaces the old one in the queue
# Returns non-zero (aborting the batch) when a rename target already exists,
# gsed yields nothing, or mv fails.
process_rename () {
  if (( ! DUMP_PDFS_RENAME )); then
    log 0 "Skipped renaming files"
    return 0
  fi

  local filesNew=()
  local fRgx="Microsoft ?Learn.pdf"
  local fSub='s/^([^\ ]*) ?(\(?[A-z_\.\-]*\)?)(([_\-\ ]*)(PowerShell[^\ ]*)?(.+)[_\-\ ]*Microsoft.+)(\.pdf)$/\1.pdf/g'
  local rNum=0
  local sNum=0
  local pdf fDir fBase fNew fErr

  log "Processing batch rename for ${#files} files..."
  for pdf in "${files[@]}"; do
    # Skip adding file to the process queue
    if [[ "${pdf:e}" != "pdf" ]]; then
      log 1 "File isn't a PDF, skipped: '${pdf}'"
      continue
    fi

    # Skip adding non-existent files to the process queue
    if [[ ! -f "$pdf" ]]; then
      log 1 "File nonexistent, skipped: '${pdf}'"
      continue
    fi

    # Process rename only if it matches cleanup regex
    if ! [[ "$pdf" =~ $fRgx ]]; then
      log 0 "Regex notmatch, skipped rename: $pdf"
      (( sNum += 1 ))
      filesNew+=("$pdf")
      continue
    fi

    # Process regex sub on the file name only. The pattern is anchored at the
    # start and cuts at the first space, so feeding it a full path would
    # truncate inside a directory name that contains a space and move the
    # file out of its directory.
    fDir=""
    if [[ "$pdf" == */* ]]; then
      fDir="${pdf%/*}/"
    fi
    fBase="$(printf '%s' "${pdf##*/}" | gsed -E "$fSub")"
    if [[ -z "$fBase" ]]; then
      log 1 "Failed to rename '${pdf}', gsed produced no output" || return $?
    fi
    fNew="${fDir}${fBase}"
    fErr="Failed to rename '${pdf}' → '${fNew}'"

    # Check if filename is shorter than the original
    # No reason to rename if it's just making the filename worse
    if (( ${#fNew} < ${#pdf} )); then
      log "Rename: $pdf → $fNew"

      # Check if dest exists
      if [[ -f "$fNew" ]]; then
        log 1 "${fErr}, destination exists" || return $?
      fi

      # Rename
      mv -- "$pdf" "$fNew" || {
        log 1 "${fErr}, mv failed" || return $?
      }

      # Check if new filename for the PDF exists
      if [[ ! -f "$fNew" ]]; then
        if [[ -f "$pdf" ]]; then
          log 1 "${fErr}, original file still exists?" || return $?
        fi
        # Literally how
        log 1 "${fErr}, new filename doesn't exist?" || return $?
      fi

      (( rNum += 1 ))
      filesNew+=("$fNew")
    else
      (( sNum += 1 ))
      filesNew+=("$pdf")
    fi
  done

  # Print stats for invalid files
  local dNum=$(( ${#files} - rNum - sNum ))
  if (( dNum > 0 )); then
    log "Removed $dNum files from the processing queue"
  fi

  files=("${filesNew[@]}")
  log "Got ${#files} valid files, renamed ${rNum}, skipped ${sNum}"
}
# Function to process OCR for each PDF.
#
# Arguments: $1 - path of the PDF to OCR
# Does nothing unless DUMP_PDFS_OCR is set, or when "<name>-ocr.pdf" already
# exists. Otherwise runs ocrmypdf twice: step 1 writes "<name>-clean.pdf",
# step 2 reads that file and writes "<name>-ocr.pdf".
# OCR is best-effort: when step 1 yields no file the function returns 0 so the
# caller continues with the original PDF; when step 2 yields no file the
# step 1 result is used as "<name>-ocr.pdf".
# Returns: 0 normally, or the status of mv when the fallback move fails.
process_ocr() {
  if (( ! DUMP_PDFS_OCR )); then
    return 0
  fi

  # Derive every path from $1 before first use, rather than relying on
  # variables of the same name that happen to be set in the caller.
  local pdf="$1"
  local pdfC="${pdf:r}-clean.pdf"
  local pdfO="${pdf:r}-ocr.pdf"
  local sig

  if [[ -f "$pdfO" ]]; then
    log 0 "Skipped processing $pdf, found $pdfO"
    return 0
  fi

  local ocrArgs=(--output-type pdfa-3)
  local ocrArgsClean=(--redo-ocr --clean)
  # NOTE(review): ocrmypdf's documentation describes --skip-text, --force-ocr
  # and --redo-ocr as mutually exclusive, so step 2 presumably fails as
  # written — confirm which mode is intended. The flags are left unchanged.
  local ocrArgsFinal=(--deskew --rotate-pages --skip-text --force-ocr)

  log "Processing OCR step (1/2) for $pdf"
  ocrmypdf "${ocrArgs[@]}" "${ocrArgsClean[@]}" "$pdf" "$pdfC"
  sig=$?
  if [[ ! -f "$pdfC" ]]; then
    log 1 "OCR step (1/2) produced no output for $pdf (ocrmypdf returned $sig), continuing without OCR"
    return 0
  fi

  log "Processing OCR step (2/2) for $pdf"
  ocrmypdf "${ocrArgs[@]}" "${ocrArgsFinal[@]}" "$pdfC" "$pdfO"
  sig=$?
  if [[ -f "$pdfO" ]]; then
    # Step 2 wrote its result; the intermediate file is no longer needed.
    # (Moving it over $pdfO here would discard the step 2 output.)
    rm -f -- "$pdfC"
  else
    log 1 "OCR step (2/2) produced no output for $pdf (ocrmypdf returned $sig), using step 1 output"
    mv -- "$pdfC" "$pdfO" || return $?
  fi

  log "Finished OCR on $pdfO"
}
# Function to process each PDF
process_pdf() {
local pdf="$1"
local txt="${pdf:r}.txt"
if [[ -f "$txt" ]]; then
log 0 "Skipped processing $pdf, found $txt"
return 0
fi
log "Extracting text from $pdf..."
# Extract text from PDF
pdftotext -layout "$pdf" "$txt" || {
log "fatal: pdftotext returned $?" || return $?
}
# Add paper template if specified
if (( DUMP_PDFS_PAPER )); then
log 0 "Adding template, -paper specified"
# Add APA style reference (placeholder, as we can't generate this automatically)
print "APA Reference (7th edition):" >"${txt}.temp"
print "[Insert APA reference here]" >>"${txt}.temp"
print "" >>"${txt}.temp"
# Add headers for required sections
print "1. Summary:" >>"${txt}.temp"
print "Purpose:" >>"${txt}.temp"
print "Participants:" >>"${txt}.temp"
print "Methods:" >>"${txt}.temp"
print "Results:" >>"${txt}.temp"
print "Conclusion:" >>"${txt}.temp"
print "" >>"${txt}.temp"
print "2. Critical Thinking:" >>"${txt}.temp"
print "Relevance and Significance:" >>"${txt}.temp"
print "" >>"${txt}.temp"
# Append the original extracted text
cat "$txt" >>"${txt}.temp"
# Replace original with formatted version
mv "${txt}.temp" "$txt"
fi
log "Extracted PDF contents: $pdf → $txt"
}
# Post-process the extracted text of one PDF.
#
# Some PDFs produce text containing the following, because of the way they
# were rendered; this step removes them:
#   - 0x0c (\f)
#   - 0x0d 0x0a → 0x0a (\r\n → \n)
#
# Arguments: $1 - path of the PDF whose "<name>.txt" should be cleaned
# Returns:   1 when the text file is missing, the rg / dos2unix status when
#            either fails, 0 otherwise (including when nothing needed cleaning)
process_fmt() {
  local src="$1"
  local out="${src:r}.txt"
  local rc

  [[ -f "$out" ]] || {
    log 1 "Failed to cleanup PDF, no such file: $out" || return $?
  }

  # Only files containing a form feed are cleaned at all; without one there is
  # no need to run dos2unix either.
  # (An `xxd -c 1 -p | rg '^0c$' -q` probe was tried here as an alternative.)
  # rg -q exits 0 on a match, 1 on no match, anything else is an error.
  rg $'\x0c' -q "$out"
  rc=$?
  if (( rc != 0 )); then
    if (( rc != 1 )); then
      log $rc "Failed cleanup check, rg returned $rc" || return $?
    fi
    log 0 "Skipped cleaning up $out"
    return 0
  fi

  log 0 "Cleaning up line endings in $out..."
  dos2unix "$out" &>/dev/null
  rc=$?
  if (( rc != 0 )); then
    log $rc "fatal: dos2unix returned $rc" || return $?
  fi

  log "Deleting 0x0c (form feed) characters from $out..."
  tr -d '\f' <"$out" >"${out}.temp"
  mv "${out}.temp" "$out"

  log "Finished cleaning up: $out"
}
# Run the per-file pipeline for the PDF named by the global `pdf`:
# OCR (process_ocr), text extraction (process_pdf), cleanup (process_fmt).
# When "<name>-ocr.pdf" exists after the OCR step, extraction and cleanup
# work on that file instead of the original.
# Returns the status of the first step that fails, 0 otherwise.
run_proc_single() {
  local pdfC="$pdf"
  local pdfO="${pdf:r}-ocr.pdf"

  process_ocr "$pdfC" || return $?

  if [[ -f "$pdfO" ]]; then
    log 0 "Using OCR'd version of $pdfC instead"
    pdfC="$pdfO"
  fi

  process_pdf "$pdfC" || return $?
  process_fmt "$pdfC" || return $?
}
# Entry point for one invocation: check tools, parse arguments, run the
# optional batch rename, then process every queued file in turn.
# Uses the globals `files` (queue) and `pdf` (current file, read by
# run_proc_single). Stops at the first failing step or file and returns its
# status; on success clears the DUMP_PDFS_* settings and returns 0.
run () {
  # Pre-process steps
  parse_pkgs || return $?
  parse_args || return $?

  # Batch process rename
  process_rename || return $?

  # Process each PDF
  for pdf in "${files[@]}"; do
    run_proc_single || {
      log $? "fatal: [$?] processing $pdf" || return $?
    }
  done

  # Cleanup
  clear_args
  log "All PDFs processed."
}
# Kick off the run. stdout and stderr are merged and piped through `ts`
# (with the arguments in tsArg); a failed run is reported through log and its
# status is used as the exit status of this side of the pipeline.
{
  if run; then
    :
  else
    log $? "Processing exited with an error" || exit $?
  fi
} 2>&1 | ts $tsArg
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment