Created
May 22, 2025 20:22
-
-
Save 5HT2/58c466929ee767cff7edc0f333194c77 to your computer and use it in GitHub Desktop.
dump-pdfs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
dump-pdfs-recent () { | |
local loc="$(whence -p fd)" || return $? | |
local arg=(-j 16 -u -e pdf --changed-within) | |
local arq=(-q -1) | |
local acw="$1" | |
if (( ! ${#acw} )); then | |
print "Usage: dump-pdfs-recent [time (3m | 1h)] [fd args]..." | |
return 64 | |
fi | |
# Remove --changed-within timestamp from rest of args for fd | |
shift | |
# Check for files first | |
fd $arg $acw $arq $@ || { | |
print "fatal: Didn't find any valid PDF files to process!" | |
return 127 | |
} | |
# Exec dump-pdfs | |
fd $arg $acw $@ -X dump-pdfs -rename $aff | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env zsh -opipefail | |
local clear_args() { | |
unset DUMP_PDFS_ARG DUMP_PDFS_PDF DUMP_PDFS_OCR DUMP_PDFS_DEBUG DUMP_PDFS_PAPER DUMP_PDFS_RENAME | |
} | |
local pdf | |
local tsArg=(-s "%.T") | |
local files=() | |
clear_args | |
DUMP_PDFS_ARG=("$@") | |
DUMP_PDFS_PDF=() | |
DUMP_PDFS_OCR=0 | |
DUMP_PDFS_DEBUG=0 | |
DUMP_PDFS_PAPER=0 | |
DUMP_PDFS_RENAME=0 | |
# Usage: log [message] | |
# log [exit code] [message] [print args] | |
# log 0 "this is a debug log" | |
# log 2 "this is a usage err" | |
# log "this is a normal log" | |
local log() { | |
local msg="[0] # $1" | |
# If an exit code is given explicitly | |
if (( ${#2} )); then | |
local sig="$1" | |
local msg="[$sig] # $2" | |
shift 2 | |
# Print to stderr & return exit code | |
if (( sig > 0 )); then | |
print >&2 $@ "$msg" | |
return $sig | |
fi | |
# Got an exit code <= 0, print debug logs | |
if (( DUMP_PDFS_DEBUG )); then | |
print $@ "$msg" | |
fi | |
return 0 | |
fi | |
shift | |
print $@ "$msg" | |
} | |
# Ensure we have necessary tools.
# Returns 1 (after logging to stderr) as soon as pdftotext or ocrmypdf cannot
# be found; returns 0 when both are available.
# Defined without a `local` prefix, which does not scope a function.
parse_pkgs () {
  command -v pdftotext >/dev/null 2>&1 || {
    log 1 "pdftotext not found. Please install poppler-utils." || return $?
  }
  command -v ocrmypdf >/dev/null 2>&1 || {
    # The hint used to name pdftotext here, pointing at the wrong package
    log 1 "ocrmypdf not found. Please install ocrmypdf." || return $?
  }
}
# Parse provided user args | |
local parse_args () { | |
for arg in "${DUMP_PDFS_ARG[@]}"; do | |
case $arg in; | |
"-paper") | |
DUMP_PDFS_PAPER=1 | |
;; | |
"-rename") | |
DUMP_PDFS_RENAME=1 | |
;; | |
"-ocr") | |
DUMP_PDFS_OCR=1 | |
;; | |
"-debug") | |
DUMP_PDFS_DEBUG=1 | |
;; | |
*) | |
files+=("$arg");; | |
esac | |
done | |
# Parse file args | |
if (( ! ${#files} )); then | |
log "No files specified, defaulting to all" | |
files+=(*.pdf) | |
log "Proceesing ${#files} files" | |
fi | |
# Source zshcmds for parr command | |
if (( debug )); then | |
source ~/.zshcmds | |
log 0 "=== Processed args ===" | |
log 0 "Files [${#files}]: $(cat - < <(parr files 2>&1))" | |
log 0 "=== Processed args ===" | |
fi | |
} | |
# Fix filenames (batched, not individual)
#
# Earlier and current forms of the cleanup expression, kept for reference:
# ^([^ ]*)([A-z_\. \-\(\)]*)( ?(_|-) Microsoft.+)(\.pdf)$
# ^([^\ ]*) ?(\(?[A-z_\.\-]*\)?)(([_\-\ ]*)(PowerShell[^\ ]*)?(.+)[_\-\ ]*Microsoft.+)(\.pdf)$
#
# Does nothing unless DUMP_PDFS_RENAME is set. Otherwise it walks `files`,
# drops entries that are not existing *.pdf files, shortens names matching the
# "Microsoft Learn.pdf" pattern and replaces `files` with the resulting queue.
# Returns non-zero when a rename cannot be completed (destination exists, mv
# failed, or the renamed file is missing afterwards).
process_rename () {
  if (( ! DUMP_PDFS_RENAME )); then
    log 0 "Skipped renaming files"
    return 0
  fi

  local filesNew=()
  local fRgx="Microsoft ?Learn.pdf"
  local fSub='s/^([^\ ]*) ?(\(?[A-z_\.\-]*\)?)(([_\-\ ]*)(PowerShell[^\ ]*)?(.+)[_\-\ ]*Microsoft.+)(\.pdf)$/\1.pdf/g'
  local rNum=0
  local sNum=0
  # Declared once here so neither leaks into the global scope
  local fNew fErr

  log "Processing batch rename for ${#files} files..."
  for pdf in $files; do
    # Skip adding file to the process queue
    if [[ "${pdf:e}" != "pdf" ]]; then
      log 1 "File isn't a PDF, skipped: '${pdf}'"
      continue
    fi

    # Skip adding non-existent files to the process queue
    if [[ ! -f "$pdf" ]]; then
      log 1 "File nonexistent, skipped: '${pdf}'"
      continue
    fi

    # Process rename only if it matches cleanup regex
    if [[ ! ( "$pdf" =~ $fRgx ) ]]; then
      log 0 "Regex notmatch, skipped rename: $pdf"
      (( sNum += 1 ))
      filesNew+=("$pdf")
      continue
    fi

    # Process regex sub on the file name only. The expression keeps the first
    # space-free token, so feeding it a path whose directory contains a space
    # would cut the target off inside the directory part.
    fNew="$(printf '%s' "${pdf:t}" | gsed -E "$fSub")"
    if [[ "$pdf" == */* ]]; then
      fNew="${pdf:h}/${fNew}"
    fi
    fErr="Failed to rename '${pdf}' → '${fNew}'"

    # Check if filename is shorter than the original
    # No reason to rename if it's just making the filename worse
    if (( ${#fNew} < ${#pdf} )); then
      log "Rename: $pdf → $fNew"

      # Check if dest exists
      if [[ -f "$fNew" ]]; then
        log 1 "${fErr}, destination exists" || return $?
      fi

      # Rename; `--` keeps a name starting with "-" from being read as an option
      mv -- "$pdf" "$fNew" || {
        log 1 "${fErr}, mv failed" || return $?
      }

      # Check if new filename for the PDF exists
      if [[ ! -f "$fNew" ]]; then
        if [[ -f "$pdf" ]]; then
          log 1 "${fErr}, original file still exists?" || return $?
        fi
        # Literally how
        log 1 "${fErr}, new filename doesn't exist?" || return $?
      fi

      (( rNum += 1 ))
      filesNew+=("$fNew")
    else
      (( sNum += 1 ))
      filesNew+=("$pdf")
    fi
  done

  # Print stats for invalid files
  local dNum=$(( ${#files} - rNum - sNum ))
  if (( dNum > 0 )); then
    log "Removed $dNum files from the processing queue"
  fi

  files=($filesNew)
  log "Got ${#files} valid files, renamed ${rNum}, skipped ${sNum}"
}
# Function to process OCR for each PDF.
#
# Usage: process_ocr <pdf>
# Does nothing unless DUMP_PDFS_OCR is set or when <pdf>'s "-ocr.pdf" sibling
# already exists. Otherwise runs ocrmypdf twice: a cleaning pass into
# "<name>-clean.pdf", then a final pass into "<name>-ocr.pdf".
# OCR is best effort: a failing pass is logged and the function still returns
# 0 so the caller can carry on with whichever file exists.
process_ocr() {
  if (( ! DUMP_PDFS_OCR )); then
    return 0
  fi

  # Paths are derived before first use. The existence check used to run ahead
  # of these declarations and only worked through the caller's variables.
  local pdf="$1"
  local pdfC="${pdf:r}-clean.pdf"
  local pdfO="${pdf:r}-ocr.pdf"

  if [[ -f "$pdfO" ]]; then
    log 0 "Skipped processing $pdf, found $pdfO"
    return 0
  fi

  local ocrArgs=(--output-type pdfa-3)
  local ocrArgsClean=(--redo-ocr --clean)
  # NOTE(review): ocrmypdf documents --skip-text, --force-ocr and --redo-ocr as
  # mutually exclusive, so this pass presumably always fails — confirm which
  # of the two flags is intended.
  local ocrArgsFinal=(--deskew --rotate-pages --skip-text --force-ocr)
  local sig

  log "Processing OCR step (1/2) for $pdf"
  ocrmypdf $ocrArgs $ocrArgsClean "$pdf" "$pdfC" || {
    sig=$?
    # log returns $sig here; deliberately not propagated (best effort)
    log $sig "OCR step (1/2) failed for $pdf, ocrmypdf returned $sig"
    return 0
  }

  log "Processing OCR step (2/2) for $pdf"
  if ocrmypdf $ocrArgs $ocrArgsFinal "$pdfC" "$pdfO"; then
    # The final output exists: drop the intermediate instead of moving it over
    # the result, which would throw the second pass away
    rm -f -- "$pdfC"
  else
    sig=$?
    log $sig "OCR step (2/2) failed for $pdf, ocrmypdf returned $sig; using cleaned output"
    # Fall back to the cleaned file so an OCR'd version is still available
    mv -- "$pdfC" "$pdfO"
  fi

  if [[ -f "$pdfO" ]]; then
    log "Finished OCR on $pdfO"
  fi
  return 0
}
# Function to process each PDF.
#
# Usage: process_pdf <pdf>
# Extracts the text of <pdf> into "<name>.txt" with `pdftotext -layout`; an
# existing text file is left alone. With DUMP_PDFS_PAPER set, a summary
# template is written ahead of the extracted text.
# Returns pdftotext's exit status when extraction fails, 0 otherwise.
process_pdf() {
  local pdf="$1"
  local txt="${pdf:r}.txt"

  if [[ -f "$txt" ]]; then
    log 0 "Skipped processing $pdf, found $txt"
    return 0
  fi

  log "Extracting text from $pdf..."

  # Extract text from PDF
  pdftotext -layout "$pdf" "$txt" || {
    # The status has to be passed to log explicitly: the one-argument form
    # returns 0, which made the `|| return` a no-op and hid the failure.
    local sig=$?
    log $sig "fatal: pdftotext returned $sig" || return $?
  }

  # Add paper template if specified
  if (( DUMP_PDFS_PAPER )); then
    log 0 "Adding template, -paper specified"
    {
      # Add APA style reference (placeholder, as we can't generate this automatically)
      print "APA Reference (7th edition):"
      print "[Insert APA reference here]"
      print ""
      # Add headers for required sections
      print "1. Summary:"
      print "Purpose:"
      print "Participants:"
      print "Methods:"
      print "Results:"
      print "Conclusion:"
      print ""
      print "2. Critical Thinking:"
      print "Relevance and Significance:"
      print ""
      # Append the original extracted text
      cat "$txt"
    } >"${txt}.temp"
    # Replace original with formatted version
    mv "${txt}.temp" "$txt"
  fi

  log "Extracted PDF contents: $pdf → $txt"
}
# Text cleanup for a PDF's extracted ".txt" file.
#
# Depending on how a PDF was rendered, its text can contain:
#   - 0x0c (\f)                      form feeds
#   - 0x0d 0x0a → 0x0a (\r\n → \n)   DOS line endings
#
# Usage: process_fmt <pdf>
# Works on "<name>.txt" next to <pdf>. The file is only touched when it
# contains a form feed; then dos2unix is run and the form feeds are deleted.
# Returns 1 when the text file is missing, the failing tool's status when rg
# or dos2unix errors out, and 0 otherwise.
process_fmt() {
  local src="$1"
  local out="${src:r}.txt"
  local rc

  [[ -f "$out" ]] || {
    log 1 "Failed to cleanup PDF, no such file: $out" || return $?
  }

  # Form-feed probe. rg -q yields 0 on a match, 1 on no match and anything
  # else on error. (A variant piping `xxd -c 1 -p` into `rg '^0c$'` was tried
  # here before and is no longer used.)
  rg $'\x0c' -q "$out"
  rc=$?
  case $rc in
    0)
      ;;
    1)
      log 0 "Skipped cleaning up $out"
      return 0
      ;;
    *)
      log $rc "Failed cleanup check, rg returned $rc" || return $?
      log 0 "Skipped cleaning up $out"
      return 0
      ;;
  esac

  log 0 "Cleaning up line endings in $out..."
  dos2unix "$out" &>/dev/null
  rc=$?
  if (( rc )); then
    log $rc "fatal: dos2unix returned $rc" || return $?
  fi

  log "Deleting 0x0c (form feed) characters from $out..."
  tr -d '\f' <"$out" >"${out}.temp"
  mv "${out}.temp" "$out"

  log "Finished cleaning up: $out"
}
# Run the per-file pipeline: optional OCR, text extraction, text cleanup.
#
# Usage: run_proc_single [pdf]
#   [pdf]  file to process; defaults to the script-level $pdf so existing
#          argument-less calls keep working
# Returns the status of the first step that fails, 0 otherwise.
# Defined without a `local` prefix, which does not scope a function.
run_proc_single() {
  local pdfC="${1:-$pdf}"
  local pdfO="${pdfC:r}-ocr.pdf"

  process_ocr "$pdfC" || return $?

  # Prefer the OCR'd copy whenever one exists on disk
  if [[ -f "$pdfO" ]]; then
    log 0 "Using OCR'd version of $pdfC instead"
    pdfC="$pdfO"
  fi

  process_pdf "$pdfC" || return $?
  process_fmt "$pdfC" || return $?
}
# Entry point: check tools, parse arguments, batch-rename, then process every
# queued PDF in turn.
# Returns the status of the first step that fails; on success clears the
# DUMP_PDFS_* settings and returns 0.
# Defined without a `local` prefix: `local` does not scope a function, and zsh
# appears to treat `local run () {...}` as a multi-name function definition.
run () {
  # Pre-process steps
  parse_pkgs || return $?
  parse_args || return $?

  # Batch process rename
  process_rename || return $?

  # Process each PDF
  for pdf in $files; do
    run_proc_single || {
      # Both $? expand to run_proc_single's status before log is called
      log $? "fatal: [$?] processing $pdf" || return $?
    }
  done

  # Cleanup
  clear_args
  log "All PDFs processed."
}
# Run everything with stdout and stderr merged through a timestamp filter.
# ts (called with $tsArg) is optional: without it the right-hand side of the
# pipe would fail to start and take the processing down with it, so fall back
# to a plain pass-through.
local tsCmd=(ts $tsArg)
if (( ! $+commands[ts] )); then
  tsCmd=(cat)
fi

{
  run || {
    # log returns the failing status, which becomes the exit status of this
    # side of the pipeline
    log $? "Processing exited with an error" || exit $?
  }
} 2>&1 | $tsCmd
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment