5HT2 · May 22, 2025 20:22
diff --git a/dump-pdfs-recent.zsh b/dump-pdfs-recent.zsh
 # Run dump-pdfs (with -rename) on every PDF that fd reports as changed
 # within the given time window.
 #
 # Usage:     dump-pdfs-recent <time (3m | 1h)> [fd args]...
 # Arguments: $1   - value handed to fd's --changed-within
 #            $2.. - extra arguments passed through to fd unchanged
 # Returns:   64 when no time window is given, 127 when fd finds no PDF,
 #            whence's status when fd is not installed, otherwise the status
 #            of the final fd invocation.
 dump-pdfs-recent () {
    # `local x="$(cmd)" || return` reports the status of `local`, not of cmd,
    # so the lookup is run as a plain command to make the guard effective.
    whence -p fd >/dev/null || {
        local sig=$?
        print >&2 "fatal: fd not found in PATH"
        return $sig
    }

    local acw="$1"
    if (( ! ${#acw} )); then
        print >&2 "Usage: dump-pdfs-recent [time (3m | 1h)] [fd args]..."
        return 64
    fi

    # Remove --changed-within timestamp from rest of args for fd
    shift

    local arg=(-j 16 -u -e pdf --changed-within "$acw")

    # Check for files first (-q: no output, -1: stop at the first hit)
    fd "${arg[@]}" -q -1 "$@" || {
        print >&2 "fatal: Didn't find any valid PDF files to process!"
        return 127
    }

    # Exec dump-pdfs once with all matches (-X) and ask it to rename them
    fd "${arg[@]}" "$@" -X dump-pdfs -rename
 }
diff --git a/dump-pdfs.zsh b/dump-pdfs.zsh
 #!/usr/bin/env zsh -opipefail

 # Remove every DUMP_PDFS_* global so that state from an earlier run cannot
 # leak into the next one. Takes no arguments.
 # (Defined without a `local` prefix: function definitions do not take one.)
 clear_args() {
    unset DUMP_PDFS_ARG DUMP_PDFS_PDF DUMP_PDFS_OCR DUMP_PDFS_DEBUG DUMP_PDFS_PAPER DUMP_PDFS_RENAME
 }

 # State shared with the functions below: the file queue, the argument list
 # for `ts`, and the PDF currently being processed.
 local files=()
 local tsArg=(-s "%.T")
 local pdf

 # Start from a clean slate, then seed the DUMP_PDFS_* globals: one 0/1
 # switch per supported flag (all off), plus the raw argument list.
 clear_args
 DUMP_PDFS_RENAME=0 DUMP_PDFS_PAPER=0 DUMP_PDFS_DEBUG=0 DUMP_PDFS_OCR=0
 DUMP_PDFS_PDF=()
 DUMP_PDFS_ARG=("$@")

 # Print a log line of the form "[<code>] # <message>".
 #
 # Usage: log [message]
 #        log [exit code] [message] [print args]
 #        log 0 "this is a debug log"
 #        log 2 "this is a usage err"
 #        log "this is a normal log"
 #
 # Arguments: with a non-empty $2, $1 is the exit code, $2 the message and
 #            anything after that is passed to `print` ahead of the message.
 #            Otherwise $1 is the message and the code is shown as 0.
 # Globals:   DUMP_PDFS_DEBUG (read) - code <= 0 lines print only when non-zero
 # Returns:   the given code when it is > 0 (the line goes to stderr),
 #            otherwise 0, so callers can write `log $sig "…" || return $?`.
 log() {
    local msg="[0] # $1"

    # If an exit code is given explicitly
    if (( ${#2} )); then
        local sig="$1"
        msg="[$sig] # $2"
        shift 2

        # Print to stderr & return exit code
        if (( sig > 0 )); then
            print >&2 "$@" "$msg"
            return $sig
        fi

        # Got an exit code <= 0, print debug logs
        if (( DUMP_PDFS_DEBUG )); then
            print "$@" "$msg"
        fi

        return 0
    fi

    # Plain message: drop it from the argument list (if there is one at all)
    (( $# )) && shift
    print "$@" "$msg"
 }

 # Ensure we have necessary tools
 #
 # Checks that pdftotext and ocrmypdf can be resolved with `command -v`.
 # Returns: 0 when both are found, otherwise 1 after logging which tool is
 #          missing and what to install for it.
 parse_pkgs () {
    command -v pdftotext >/dev/null 2>&1 || {
        log 1 "pdftotext not found. Please install poppler-utils." || return $?
    }
    # The hint used to name pdftotext here, which is the wrong package.
    command -v ocrmypdf >/dev/null 2>&1 || {
        log 1 "ocrmypdf not found. Please install ocrmypdf." || return $?
    }
 }

 # Parse provided user args
 #
 # Walks DUMP_PDFS_ARG: -paper, -rename, -ocr and -debug set the matching
 # DUMP_PDFS_* switch to 1; every other word is appended to `files`.
 # When no file was named, all *.pdf in the current directory are queued.
 #
 # Globals: DUMP_PDFS_ARG (read); DUMP_PDFS_PAPER, DUMP_PDFS_RENAME,
 #          DUMP_PDFS_OCR, DUMP_PDFS_DEBUG, files (written)
 parse_args () {
    local arg

    for arg in "${DUMP_PDFS_ARG[@]}"; do
        case "$arg" in
            -paper)  DUMP_PDFS_PAPER=1 ;;
            -rename) DUMP_PDFS_RENAME=1 ;;
            -ocr)    DUMP_PDFS_OCR=1 ;;
            -debug)  DUMP_PDFS_DEBUG=1 ;;
            *)       files+=("$arg") ;;
        esac
    done

    # Parse file args
    if (( ! ${#files[@]} )); then
        log "No files specified, defaulting to all"
        # NOTE(review): with zsh's default NOMATCH behaviour an unmatched glob
        # is presumably reported as an error here - confirm that is intended.
        files+=(*.pdf)
        log "Processing ${#files[@]} files"
    fi

    # Dump the parsed queue in debug mode. This used to test an unset `debug`
    # variable, so the block could never run; -debug sets DUMP_PDFS_DEBUG.
    if (( DUMP_PDFS_DEBUG )); then
        # Source zshcmds for parr command
        if [[ -r ~/.zshcmds ]]; then
            source ~/.zshcmds
        fi

        log 0 "=== Processed args ==="
        log 0 "Files [${#files[@]}]: $(cat - < <(parr files 2>&1))"
        log 0 "=== Processed args ==="
    fi
 }

 # Fix filenames (batched, not individual)
 #
 # Only active when DUMP_PDFS_RENAME is non-zero. Every entry of `files` is
 # checked: entries that are not *.pdf or do not exist are dropped from the
 # queue; names matching "Microsoft ?Learn.pdf" are shortened to their leading
 # run of characters (up to the first space) plus ".pdf"; everything else is
 # kept as is. The surviving names replace `files`.
 #
 # Cleanup patterns (the second one is what the sed call below applies):
 # ^([^ ]*)([A-z_\. \-\(\)]*)( ?(_|-) Microsoft.+)(\.pdf)$
 # ^([^\ ]*) ?(\(?[A-z_\.\-]*\)?)(([_\-\ ]*)(PowerShell[^\ ]*)?(.+)[_\-\ ]*Microsoft.+)(\.pdf)$
 #
 # Globals: DUMP_PDFS_RENAME (read), files (read and replaced)
 # Returns: 0 normally; 1 as soon as a rename cannot be completed
 process_rename () {
    if (( ! DUMP_PDFS_RENAME )); then
        log 0 "Skipped renaming files"
        return 0
    fi

    local filesNew=()
    local fRgx="Microsoft ?Learn.pdf"
    local rNum=0
    local sNum=0
    # Kept local so the loop does not leave globals behind
    local pdf="" fNew="" fErr=""

    log "Processing batch rename for ${#files[@]} files..."
    for pdf in "${files[@]}"; do
        # Skip adding file to the process queue
        if [[ "$pdf" != *.pdf ]]; then
            log 1 "File isn't a PDF, skipped: '${pdf}'"
            continue
        fi

        # Skip adding non-existent files to the process queue
        if [[ ! -f "$pdf" ]]; then
            log 1 "File nonexistent, skipped: '${pdf}'"
            continue
        fi

        # Process rename only if it matches cleanup regex
        if [[ ! "$pdf" =~ $fRgx ]]; then
            log 0 "Regex notmatch, skipped rename: $pdf"
            (( sNum += 1 ))
            filesNew+=("$pdf")
            continue
        fi

        # Process regex sub
        fNew="$(printf '%s' "$pdf" | gsed -E 's/^([^\ ]*) ?(\(?[A-z_\.\-]*\)?)(([_\-\ ]*)(PowerShell[^\ ]*)?(.+)[_\-\ ]*Microsoft.+)(\.pdf)$/\1.pdf/g')"
        fErr="Failed to rename '${pdf}' → '${fNew}'"

        # Check if filename is shorter than the original
        # No reason to rename if it's just making the filename worse
        if (( ${#fNew} >= ${#pdf} )); then
            (( sNum += 1 ))
            filesNew+=("$pdf")
            continue
        fi

        log "Rename: $pdf → $fNew"

        # Check if dest exists
        if [[ -f "$fNew" ]]; then
            log 1 "${fErr}, destination exists" || return $?
        fi

        # Rename (-- protects names that start with a dash)
        mv -- "$pdf" "$fNew" || {
            log 1 "${fErr}, mv failed" || return $?
        }

        # Check if new filename for the PDF exists
        if [[ ! -f "$fNew" ]]; then
            if [[ -f "$pdf" ]]; then
                log 1 "${fErr}, original file still exists?" || return $?
            fi

            # Literally how
            log 1 "${fErr}, new filename doesn't exist?" || return $?
        fi

        (( rNum += 1 ))
        filesNew+=("$fNew")
    done

    # Print stats for invalid files
    local dNum=$(( ${#files[@]} - rNum - sNum ))
    if (( dNum > 0 )); then
        log "Removed $dNum files from the processing queue"
    fi

    files=("${filesNew[@]}")
    log "Got ${#files[@]} valid files, renamed ${rNum}, skipped ${sNum}"
 }

 # Function to process OCR for each PDF
 #
 # Runs ocrmypdf twice: a cleaning pass into "<name>-clean.pdf", then a final
 # pass from that file into "<name>-ocr.pdf".
 #
 # Arguments: $1 - path of the PDF to OCR
 # Globals:   DUMP_PDFS_OCR (read) - nothing happens unless it is non-zero
 # Returns:   0 when skipped or finished, otherwise the status of the step
 #            that failed
 process_ocr() {
    if (( ! DUMP_PDFS_OCR )); then
        return 0
    fi

    # Derive all paths from $1 before using them. The skip check below used to
    # run ahead of these declarations and only saw whatever the caller had set.
    local pdf="$1"
    local pdfC="${pdf:r}-clean.pdf"
    local pdfO="${pdf:r}-ocr.pdf"

    if [[ -f "$pdfO" ]]; then
        log 0 "Skipped processing $pdf, found $pdfO"
        return 0
    fi

    local ocrArgs=(--output-type pdfa-3)
    local ocrArgsClean=(--redo-ocr --clean)
    # NOTE(review): ocrmypdf documents --skip-text, --force-ocr and --redo-ocr
    # as mutually exclusive, so step 2 presumably fails with this set - confirm
    # which of the two is wanted.
    local ocrArgsFinal=(--deskew --rotate-pages --skip-text --force-ocr)

    log "Processing OCR step (1/2) for $pdf"
    ocrmypdf $ocrArgs $ocrArgsClean "$pdf" "$pdfC" || {
        local sig=$?
        log $sig "fatal: ocrmypdf step (1/2) returned $sig for $pdf" || return $?
    }

    log "Processing OCR step (2/2) for $pdf"
    if ocrmypdf $ocrArgs $ocrArgsFinal "$pdfC" "$pdfO" && [[ -f "$pdfO" ]]; then
        # Step 2 wrote the final file; only the intermediate is removed.
        # (Moving the intermediate over it would throw the step-2 result away.)
        rm -f -- "$pdfC"
    else
        # Keep the previous outcome for a failed second pass: the step-1
        # result becomes the OCR'd file.
        log "OCR step (2/2) failed for $pdf, keeping step (1/2) output"
        mv -- "$pdfC" "$pdfO" || {
            local sig=$?
            log $sig "fatal: could not move $pdfC to $pdfO" || return $?
        }
    fi

    log "Finished OCR on $pdfO"
 }

 # Function to process each PDF
 #
 # Extracts the text of a PDF into "<name>.txt" with `pdftotext -layout`,
 # unless that file already exists. With -paper, a fill-in template (APA
 # reference, summary and critical-thinking headings) is put in front of the
 # extracted text.
 #
 # Arguments: $1 - path of the PDF
 # Globals:   DUMP_PDFS_PAPER (read)
 # Returns:   0 on success or skip; pdftotext's status when extraction fails;
 #            1 when the template file cannot be written or moved
 process_pdf() {
    local pdf="$1"
    local txt="${pdf:r}.txt"
    local tmp="${txt}.temp"

    if [[ -f "$txt" ]]; then
        log 0 "Skipped processing $pdf, found $txt"
        return 0
    fi

    log "Extracting text from $pdf..."

    # Extract text from PDF. The status is handed to log explicitly: the
    # one-argument form of log always returns 0, so `|| return` never fired.
    pdftotext -layout "$pdf" "$txt" || {
        local sig=$?
        log $sig "fatal: pdftotext returned $sig" || return $?
    }

    # Add paper template if specified
    if (( DUMP_PDFS_PAPER )); then
        log 0 "Adding template, -paper specified"

        {
            # APA style reference (placeholder, as we can't generate this
            # automatically), then headers for the required sections
            print -l -- \
                "APA Reference (7th edition):" \
                "[Insert APA reference here]" \
                "" \
                "1. Summary:" \
                "Purpose:" \
                "Participants:" \
                "Methods:" \
                "Results:" \
                "Conclusion:" \
                "" \
                "2. Critical Thinking:" \
                "Relevance and Significance:" \
                ""

            # Append the original extracted text
            cat "$txt"
        } >"$tmp" || {
            log 1 "fatal: could not write $tmp" || return $?
        }

        # Replace original with formatted version
        mv "$tmp" "$txt" || {
            log 1 "fatal: could not replace $txt" || return $?
        }
    fi

    log "Extracted PDF contents: $pdf → $txt"
 }

 # Some PDFs contain these due to the way that they were rendered.
 # Cleanup line endings, used to remove the following:
 # - 0x0c (\f)
 # - 0x0d 0x0a → 0x0a (\r\n → \n)
 #
 # Arguments: $1 - path of the PDF; the file cleaned is "<name>.txt"
 # Returns:   0 when cleaned or when no form feed is present; 1 when the text
 #            file is missing or cannot be rewritten; the status of rg or
 #            dos2unix when one of them fails
 process_fmt() {
    local pdf="$1"
    local txt="${pdf:r}.txt"
    local tmp="${txt}.temp"

    if [[ ! -f "$txt" ]]; then
        log 1 "Failed to cleanup PDF, no such file: $txt" || return $?
    fi

    # Check for existence of 0x0c, no need to run dos2unix otherwise.
    # -q will return 0 if matched, 1 if not matched, otherwise error
    rg $'\x0c' -q -- "$txt" || {
        local sig=$?

        if (( sig != 1 )); then
            log $sig "Failed cleanup check, rg returned $sig" || return $?
        fi

        log 0 "Skipped cleaning up $txt"
        return 0
    }

    log 0 "Cleaning up line endings in $txt..."
    dos2unix "$txt" &>/dev/null || {
        local sig=$?
        log $sig "fatal: dos2unix returned $sig" || return $?
    }

    log "Deleting 0x0c (form feed) characters from $txt..."
    # Only swap the file in when tr succeeded, so a failed write cannot
    # replace the text with a truncated copy.
    tr -d '\f' <"$txt" >"$tmp" || {
        rm -f -- "$tmp"
        log 1 "fatal: could not write $tmp" || return $?
    }
    mv "$tmp" "$txt" || {
        log 1 "fatal: could not replace $txt" || return $?
    }

    log "Finished cleaning up: $txt"
 }

 # Run the per-file pipeline for one PDF: optional OCR, text extraction, then
 # line-ending cleanup. When "<name>-ocr.pdf" exists afterwards, extraction
 # and cleanup work on that file instead of the original.
 #
 # Arguments: $1 - path of the PDF (optional; defaults to the shared $pdf)
 # Returns:   0 on success, otherwise the status of the first failing step
 run_proc_single() {
    # Keep the names pdfC/pdfO: callees can see them through dynamic scoping.
    local pdfC="${1:-$pdf}"
    local pdfO="${pdfC:r}-ocr.pdf"

    process_ocr "$pdfC" || return $?

    if [[ -f "$pdfO" ]]; then
        log 0 "Using OCR'd version of $pdfC instead"
        pdfC="$pdfO"
    fi

    process_pdf "$pdfC" || return $?
    process_fmt "$pdfC" || return $?
 }

 # Drive the whole job: check tools, parse arguments, batch-rename, then
 # process every queued file in order.
 #
 # Globals: files (read), pdf (set to the file being processed)
 # Returns: 0 when every file was processed, otherwise the status of the
 #          first step that failed (processing stops there)
 run () {
    # Pre-process steps
    parse_pkgs || return $?
    parse_args || return $?

    # Batch process rename
    process_rename || return $?

    # Process each PDF. `pdf` is deliberately the shared variable:
    # run_proc_single falls back to it when called without arguments.
    for pdf in "${files[@]}"; do
        run_proc_single || {
            local sig=$?
            log $sig "fatal: [$sig] processing $pdf" || return $?
        }
    done

    # Cleanup
    clear_args
    log "All PDFs processed."
 }

 # Entry point: execute run with stderr merged into stdout and pipe all of it
 # through `ts` with the arguments in tsArg (presumably the moreutils
 # timestamper - confirm).
 {
    run || {
        # log returns the non-zero status it was given, which triggers exit
        log $? "Processing exited with an error" || exit $?
    }
 } 2>&1 | ts $tsArg
	# Run dump-pdfs (with -rename) on every PDF that fd reports as changed
	# within the given time window.
	#
	# Usage:     dump-pdfs-recent <time (3m | 1h)> [fd args]...
	# Arguments: $1   - value handed to fd's --changed-within
	#            $2.. - extra arguments passed through to fd unchanged
	# Returns:   64 when no time window is given, 127 when fd finds no PDF,
	#            whence's status when fd is not installed, otherwise the status
	#            of the final fd invocation.
	dump-pdfs-recent () {
	    # `local x="$(cmd)" || return` reports the status of `local`, not of cmd,
	    # so the lookup is run as a plain command to make the guard effective.
	    whence -p fd >/dev/null || {
	        local sig=$?
	        print >&2 "fatal: fd not found in PATH"
	        return $sig
	    }

	    local acw="$1"
	    if (( ! ${#acw} )); then
	        print >&2 "Usage: dump-pdfs-recent [time (3m | 1h)] [fd args]..."
	        return 64
	    fi

	    # Remove --changed-within timestamp from rest of args for fd
	    shift

	    local arg=(-j 16 -u -e pdf --changed-within "$acw")

	    # Check for files first (-q: no output, -1: stop at the first hit)
	    fd "${arg[@]}" -q -1 "$@" || {
	        print >&2 "fatal: Didn't find any valid PDF files to process!"
	        return 127
	    }

	    # Exec dump-pdfs once with all matches (-X) and ask it to rename them
	    fd "${arg[@]}" "$@" -X dump-pdfs -rename
	}
	#!/usr/bin/env zsh -opipefail

	# Remove every DUMP_PDFS_* global so that state from an earlier run cannot
	# leak into the next one. Takes no arguments.
	# (Defined without a `local` prefix: function definitions do not take one.)
	clear_args() {
	    unset DUMP_PDFS_ARG DUMP_PDFS_PDF DUMP_PDFS_OCR DUMP_PDFS_DEBUG DUMP_PDFS_PAPER DUMP_PDFS_RENAME
	}

	# State shared with the functions below: the file queue, the argument list
	# for `ts`, and the PDF currently being processed.
	local files=()
	local tsArg=(-s "%.T")
	local pdf

	# Start from a clean slate, then seed the DUMP_PDFS_* globals: one 0/1
	# switch per supported flag (all off), plus the raw argument list.
	clear_args
	DUMP_PDFS_RENAME=0 DUMP_PDFS_PAPER=0 DUMP_PDFS_DEBUG=0 DUMP_PDFS_OCR=0
	DUMP_PDFS_PDF=()
	DUMP_PDFS_ARG=("$@")

	# Print a log line of the form "[<code>] # <message>".
	#
	# Usage: log [message]
	#        log [exit code] [message] [print args]
	#        log 0 "this is a debug log"
	#        log 2 "this is a usage err"
	#        log "this is a normal log"
	#
	# Arguments: with a non-empty $2, $1 is the exit code, $2 the message and
	#            anything after that is passed to `print` ahead of the message.
	#            Otherwise $1 is the message and the code is shown as 0.
	# Globals:   DUMP_PDFS_DEBUG (read) - code <= 0 lines print only when non-zero
	# Returns:   the given code when it is > 0 (the line goes to stderr),
	#            otherwise 0, so callers can write `log $sig "…" || return $?`.
	log() {
	    local msg="[0] # $1"

	    # If an exit code is given explicitly
	    if (( ${#2} )); then
	        local sig="$1"
	        msg="[$sig] # $2"
	        shift 2

	        # Print to stderr & return exit code
	        if (( sig > 0 )); then
	            print >&2 "$@" "$msg"
	            return $sig
	        fi

	        # Got an exit code <= 0, print debug logs
	        if (( DUMP_PDFS_DEBUG )); then
	            print "$@" "$msg"
	        fi

	        return 0
	    fi

	    # Plain message: drop it from the argument list (if there is one at all)
	    (( $# )) && shift
	    print "$@" "$msg"
	}

	# Ensure we have necessary tools
	#
	# Checks that pdftotext and ocrmypdf can be resolved with `command -v`.
	# Returns: 0 when both are found, otherwise 1 after logging which tool is
	#          missing and what to install for it.
	parse_pkgs () {
	    command -v pdftotext >/dev/null 2>&1 || {
	        log 1 "pdftotext not found. Please install poppler-utils." || return $?
	    }
	    # The hint used to name pdftotext here, which is the wrong package.
	    command -v ocrmypdf >/dev/null 2>&1 || {
	        log 1 "ocrmypdf not found. Please install ocrmypdf." || return $?
	    }
	}

	# Parse provided user args
	#
	# Walks DUMP_PDFS_ARG: -paper, -rename, -ocr and -debug set the matching
	# DUMP_PDFS_* switch to 1; every other word is appended to `files`.
	# When no file was named, all *.pdf in the current directory are queued.
	#
	# Globals: DUMP_PDFS_ARG (read); DUMP_PDFS_PAPER, DUMP_PDFS_RENAME,
	#          DUMP_PDFS_OCR, DUMP_PDFS_DEBUG, files (written)
	parse_args () {
	    local arg

	    for arg in "${DUMP_PDFS_ARG[@]}"; do
	        case "$arg" in
	            -paper)  DUMP_PDFS_PAPER=1 ;;
	            -rename) DUMP_PDFS_RENAME=1 ;;
	            -ocr)    DUMP_PDFS_OCR=1 ;;
	            -debug)  DUMP_PDFS_DEBUG=1 ;;
	            *)       files+=("$arg") ;;
	        esac
	    done

	    # Parse file args
	    if (( ! ${#files[@]} )); then
	        log "No files specified, defaulting to all"
	        # NOTE(review): with zsh's default NOMATCH behaviour an unmatched glob
	        # is presumably reported as an error here - confirm that is intended.
	        files+=(*.pdf)
	        log "Processing ${#files[@]} files"
	    fi

	    # Dump the parsed queue in debug mode. This used to test an unset `debug`
	    # variable, so the block could never run; -debug sets DUMP_PDFS_DEBUG.
	    if (( DUMP_PDFS_DEBUG )); then
	        # Source zshcmds for parr command
	        if [[ -r ~/.zshcmds ]]; then
	            source ~/.zshcmds
	        fi

	        log 0 "=== Processed args ==="
	        log 0 "Files [${#files[@]}]: $(cat - < <(parr files 2>&1))"
	        log 0 "=== Processed args ==="
	    fi
	}

	# Fix filenames (batched, not individual)
	#
	# Only active when DUMP_PDFS_RENAME is non-zero. Every entry of `files` is
	# checked: entries that are not *.pdf or do not exist are dropped from the
	# queue; names matching "Microsoft ?Learn.pdf" are shortened to their leading
	# run of characters (up to the first space) plus ".pdf"; everything else is
	# kept as is. The surviving names replace `files`.
	#
	# Cleanup patterns (the second one is what the sed call below applies):
	# ^([^ ]*)([A-z_\. \-\(\)]*)( ?(_|-) Microsoft.+)(\.pdf)$
	# ^([^\ ]*) ?(\(?[A-z_\.\-]*\)?)(([_\-\ ]*)(PowerShell[^\ ]*)?(.+)[_\-\ ]*Microsoft.+)(\.pdf)$
	#
	# Globals: DUMP_PDFS_RENAME (read), files (read and replaced)
	# Returns: 0 normally; 1 as soon as a rename cannot be completed
	process_rename () {
	    if (( ! DUMP_PDFS_RENAME )); then
	        log 0 "Skipped renaming files"
	        return 0
	    fi

	    local filesNew=()
	    local fRgx="Microsoft ?Learn.pdf"
	    local rNum=0
	    local sNum=0
	    # Kept local so the loop does not leave globals behind
	    local pdf="" fNew="" fErr=""

	    log "Processing batch rename for ${#files[@]} files..."
	    for pdf in "${files[@]}"; do
	        # Skip adding file to the process queue
	        if [[ "$pdf" != *.pdf ]]; then
	            log 1 "File isn't a PDF, skipped: '${pdf}'"
	            continue
	        fi

	        # Skip adding non-existent files to the process queue
	        if [[ ! -f "$pdf" ]]; then
	            log 1 "File nonexistent, skipped: '${pdf}'"
	            continue
	        fi

	        # Process rename only if it matches cleanup regex
	        if [[ ! "$pdf" =~ $fRgx ]]; then
	            log 0 "Regex notmatch, skipped rename: $pdf"
	            (( sNum += 1 ))
	            filesNew+=("$pdf")
	            continue
	        fi

	        # Process regex sub
	        fNew="$(printf '%s' "$pdf" | gsed -E 's/^([^\ ]*) ?(\(?[A-z_\.\-]*\)?)(([_\-\ ]*)(PowerShell[^\ ]*)?(.+)[_\-\ ]*Microsoft.+)(\.pdf)$/\1.pdf/g')"
	        fErr="Failed to rename '${pdf}' → '${fNew}'"

	        # Check if filename is shorter than the original
	        # No reason to rename if it's just making the filename worse
	        if (( ${#fNew} >= ${#pdf} )); then
	            (( sNum += 1 ))
	            filesNew+=("$pdf")
	            continue
	        fi

	        log "Rename: $pdf → $fNew"

	        # Check if dest exists
	        if [[ -f "$fNew" ]]; then
	            log 1 "${fErr}, destination exists" || return $?
	        fi

	        # Rename (-- protects names that start with a dash)
	        mv -- "$pdf" "$fNew" || {
	            log 1 "${fErr}, mv failed" || return $?
	        }

	        # Check if new filename for the PDF exists
	        if [[ ! -f "$fNew" ]]; then
	            if [[ -f "$pdf" ]]; then
	                log 1 "${fErr}, original file still exists?" || return $?
	            fi

	            # Literally how
	            log 1 "${fErr}, new filename doesn't exist?" || return $?
	        fi

	        (( rNum += 1 ))
	        filesNew+=("$fNew")
	    done

	    # Print stats for invalid files
	    local dNum=$(( ${#files[@]} - rNum - sNum ))
	    if (( dNum > 0 )); then
	        log "Removed $dNum files from the processing queue"
	    fi

	    files=("${filesNew[@]}")
	    log "Got ${#files[@]} valid files, renamed ${rNum}, skipped ${sNum}"
	}

	# Function to process OCR for each PDF
	#
	# Runs ocrmypdf twice: a cleaning pass into "<name>-clean.pdf", then a final
	# pass from that file into "<name>-ocr.pdf".
	#
	# Arguments: $1 - path of the PDF to OCR
	# Globals:   DUMP_PDFS_OCR (read) - nothing happens unless it is non-zero
	# Returns:   0 when skipped or finished, otherwise the status of the step
	#            that failed
	process_ocr() {
	    if (( ! DUMP_PDFS_OCR )); then
	        return 0
	    fi

	    # Derive all paths from $1 before using them. The skip check below used to
	    # run ahead of these declarations and only saw whatever the caller had set.
	    local pdf="$1"
	    local pdfC="${pdf:r}-clean.pdf"
	    local pdfO="${pdf:r}-ocr.pdf"

	    if [[ -f "$pdfO" ]]; then
	        log 0 "Skipped processing $pdf, found $pdfO"
	        return 0
	    fi

	    local ocrArgs=(--output-type pdfa-3)
	    local ocrArgsClean=(--redo-ocr --clean)
	    # NOTE(review): ocrmypdf documents --skip-text, --force-ocr and --redo-ocr
	    # as mutually exclusive, so step 2 presumably fails with this set - confirm
	    # which of the two is wanted.
	    local ocrArgsFinal=(--deskew --rotate-pages --skip-text --force-ocr)

	    log "Processing OCR step (1/2) for $pdf"
	    ocrmypdf $ocrArgs $ocrArgsClean "$pdf" "$pdfC" || {
	        local sig=$?
	        log $sig "fatal: ocrmypdf step (1/2) returned $sig for $pdf" || return $?
	    }

	    log "Processing OCR step (2/2) for $pdf"
	    if ocrmypdf $ocrArgs $ocrArgsFinal "$pdfC" "$pdfO" && [[ -f "$pdfO" ]]; then
	        # Step 2 wrote the final file; only the intermediate is removed.
	        # (Moving the intermediate over it would throw the step-2 result away.)
	        rm -f -- "$pdfC"
	    else
	        # Keep the previous outcome for a failed second pass: the step-1
	        # result becomes the OCR'd file.
	        log "OCR step (2/2) failed for $pdf, keeping step (1/2) output"
	        mv -- "$pdfC" "$pdfO" || {
	            local sig=$?
	            log $sig "fatal: could not move $pdfC to $pdfO" || return $?
	        }
	    fi

	    log "Finished OCR on $pdfO"
	}

	# Function to process each PDF
	#
	# Extracts the text of a PDF into "<name>.txt" with `pdftotext -layout`,
	# unless that file already exists. With -paper, a fill-in template (APA
	# reference, summary and critical-thinking headings) is put in front of the
	# extracted text.
	#
	# Arguments: $1 - path of the PDF
	# Globals:   DUMP_PDFS_PAPER (read)
	# Returns:   0 on success or skip; pdftotext's status when extraction fails;
	#            1 when the template file cannot be written or moved
	process_pdf() {
	    local pdf="$1"
	    local txt="${pdf:r}.txt"
	    local tmp="${txt}.temp"

	    if [[ -f "$txt" ]]; then
	        log 0 "Skipped processing $pdf, found $txt"
	        return 0
	    fi

	    log "Extracting text from $pdf..."

	    # Extract text from PDF. The status is handed to log explicitly: the
	    # one-argument form of log always returns 0, so `|| return` never fired.
	    pdftotext -layout "$pdf" "$txt" || {
	        local sig=$?
	        log $sig "fatal: pdftotext returned $sig" || return $?
	    }

	    # Add paper template if specified
	    if (( DUMP_PDFS_PAPER )); then
	        log 0 "Adding template, -paper specified"

	        {
	            # APA style reference (placeholder, as we can't generate this
	            # automatically), then headers for the required sections
	            print -l -- \
	                "APA Reference (7th edition):" \
	                "[Insert APA reference here]" \
	                "" \
	                "1. Summary:" \
	                "Purpose:" \
	                "Participants:" \
	                "Methods:" \
	                "Results:" \
	                "Conclusion:" \
	                "" \
	                "2. Critical Thinking:" \
	                "Relevance and Significance:" \
	                ""

	            # Append the original extracted text
	            cat "$txt"
	        } >"$tmp" || {
	            log 1 "fatal: could not write $tmp" || return $?
	        }

	        # Replace original with formatted version
	        mv "$tmp" "$txt" || {
	            log 1 "fatal: could not replace $txt" || return $?
	        }
	    fi

	    log "Extracted PDF contents: $pdf → $txt"
	}

	# Some PDFs contain these due to the way that they were rendered.
	# Cleanup line endings, used to remove the following:
	# - 0x0c (\f)
	# - 0x0d 0x0a → 0x0a (\r\n → \n)
	#
	# Arguments: $1 - path of the PDF; the file cleaned is "<name>.txt"
	# Returns:   0 when cleaned or when no form feed is present; 1 when the text
	#            file is missing or cannot be rewritten; the status of rg or
	#            dos2unix when one of them fails
	process_fmt() {
	    local pdf="$1"
	    local txt="${pdf:r}.txt"
	    local tmp="${txt}.temp"

	    if [[ ! -f "$txt" ]]; then
	        log 1 "Failed to cleanup PDF, no such file: $txt" || return $?
	    fi

	    # Check for existence of 0x0c, no need to run dos2unix otherwise.
	    # -q will return 0 if matched, 1 if not matched, otherwise error
	    rg $'\x0c' -q -- "$txt" || {
	        local sig=$?

	        if (( sig != 1 )); then
	            log $sig "Failed cleanup check, rg returned $sig" || return $?
	        fi

	        log 0 "Skipped cleaning up $txt"
	        return 0
	    }

	    log 0 "Cleaning up line endings in $txt..."
	    dos2unix "$txt" &>/dev/null || {
	        local sig=$?
	        log $sig "fatal: dos2unix returned $sig" || return $?
	    }

	    log "Deleting 0x0c (form feed) characters from $txt..."
	    # Only swap the file in when tr succeeded, so a failed write cannot
	    # replace the text with a truncated copy.
	    tr -d '\f' <"$txt" >"$tmp" || {
	        rm -f -- "$tmp"
	        log 1 "fatal: could not write $tmp" || return $?
	    }
	    mv "$tmp" "$txt" || {
	        log 1 "fatal: could not replace $txt" || return $?
	    }

	    log "Finished cleaning up: $txt"
	}

	# Run the per-file pipeline for one PDF: optional OCR, text extraction, then
	# line-ending cleanup. When "<name>-ocr.pdf" exists afterwards, extraction
	# and cleanup work on that file instead of the original.
	#
	# Arguments: $1 - path of the PDF (optional; defaults to the shared $pdf)
	# Returns:   0 on success, otherwise the status of the first failing step
	run_proc_single() {
	    # Keep the names pdfC/pdfO: callees can see them through dynamic scoping.
	    local pdfC="${1:-$pdf}"
	    local pdfO="${pdfC:r}-ocr.pdf"

	    process_ocr "$pdfC" || return $?

	    if [[ -f "$pdfO" ]]; then
	        log 0 "Using OCR'd version of $pdfC instead"
	        pdfC="$pdfO"
	    fi

	    process_pdf "$pdfC" || return $?
	    process_fmt "$pdfC" || return $?
	}

	# Drive the whole job: check tools, parse arguments, batch-rename, then
	# process every queued file in order.
	#
	# Globals: files (read), pdf (set to the file being processed)
	# Returns: 0 when every file was processed, otherwise the status of the
	#          first step that failed (processing stops there)
	run () {
	    # Pre-process steps
	    parse_pkgs || return $?
	    parse_args || return $?

	    # Batch process rename
	    process_rename || return $?

	    # Process each PDF. `pdf` is deliberately the shared variable:
	    # run_proc_single falls back to it when called without arguments.
	    for pdf in "${files[@]}"; do
	        run_proc_single || {
	            local sig=$?
	            log $sig "fatal: [$sig] processing $pdf" || return $?
	        }
	    done

	    # Cleanup
	    clear_args
	    log "All PDFs processed."
	}

	# Entry point: execute run with stderr merged into stdout and pipe all of it
	# through `ts` with the arguments in tsArg (presumably the moreutils
	# timestamper - confirm).
	{
	    run || {
	        # log returns the non-zero status it was given, which triggers exit
	        log $? "Processing exited with an error" || exit $?
	    }
	} 2>&1 | ts $tsArg