nerun · May 17, 2025 17:34 · nerun · Aug 1, 2023
diff --git a/paragrapher.sh b/paragrapher.sh
 #!/bin/zsh
 ################################################################################
 # PARAGRAPHER revision 20
 #     By Daniel "Nerun" Rodrigues
 #     May 17, 2025
 #     https://gist.github.com/nerun/8318924aa35f3f27231f86468804cc8c
 #
 # The purpose of this script is to analyze plain text files (with or without the
 # ".txt" extension) looking for broken paragraphs, i.e., paragraphs split
 # across more than one line, and join each one into a single very long line.
 #
 # This program is free software; you can redistribute it and/or modify it under
 # the terms of the Creative Commons Zero 1.0 Universal (CC0 1.0) Public Domain
 # Dedication (https://creativecommons.org/publicdomain/zero/1.0/).
 ################################################################################
 # Command-line handling. Either prints the help or version text and exits 0,
 # or sets the two globals used by the rest of the script:
 #   columns  - minimum line length for a line to be treated as paragraph text
 #   filename - path of the input file
 if [[ -z "$*" ]] || [[ "$1" = "-h" ]] || [[ "$1" = "--help" ]]; then
    # No arguments at all, -h or --help: show the usage text.
    cat << EOF
 Paragrapher processes plain text files to fix broken paragraphs — lines split
 where they shouldn’t be — and joins them into single, long lines.

 Usage: paragrapher [OPTION] <FILE>

  -c, --columns    Set minimum line length to detect paragraphs. Default is 72
                   columns. Use lower values (50 or 60) for shorter paragraphs.
                   Example: paragrapher -c 60 filename

  -h, --help       Show this help message.

  -v, --version    Show version, author, URL, and license info.

 The output is always saved as filename_paragraphed.

 Works well with Markdown — recognizes headings and lists. Run it directly on
 ".md" files: paragrapher filename.md.
 EOF
    exit 0
 elif [[ "$1" = "-v" || "$1" = "--version" ]]; then
    # -v or --version: show revision, author, URL and license.
    cat << EOF
 PARAGRAPHER revision 20 (May 17th, 2025)
 By Daniel "Nerun" Rodrigues
 https://gist.github.com/nerun/8318924aa35f3f27231f86468804cc8c

 This program is free software; you can redistribute it and/or modify it under
 the terms of the Creative Commons Zero 1.0 Universal (CC0 1.0) Public Domain
 Dedication (https://creativecommons.org/publicdomain/zero/1.0/).
 EOF
    exit 0
 elif [[ "$1" = "-c" || "$1" = "--columns" ]]; then
    # -c/--columns needs a numeric second argument and a third argument (the
    # file). <-> is zsh's numeric-range glob; without bounds it matches any
    # run of digits.
    if [[ -n "$2" && "$2" == <-> && -n "$3" ]]; then
        columns=$2
        filename="$3"
    else
        echo "Error: incorrect usage. Example: paragrapher -c 72 file.txt"
        exit 1
    fi
 else
    # Any other first argument is taken as the input file; use the default
    # width of 72 columns.
    columns=72
    filename="$1"
 fi

 # Input validation, written as a sequence of guard clauses: every failed
 # check prints its message and exits with status 1.

 # A file name must have been supplied.
 if [[ -z "$filename" ]]; then
    echo "Error: no input file specified."
    exit 1
 fi

 # The file name must refer to an existing regular file.
 if [[ ! -f "$filename" ]]; then
    echo "Error: there's no file \"$filename\"."
    exit 1
 fi

 # Ask file(1) for a description of the input and show it to the user.
 file_test=$(file -b "$filename")
 echo "$file_test"

 # Only inputs whose description mentions "text" are accepted.
 case "$file_test" in
    *"text"*)
        ;;
    *)
        echo "Error: \"$filename\" is not a plain text file."
        echo "(detected: \"$file_test\")."
        exit 1
        ;;
 esac

 # Inputs not reported as UTF-8, or reported with CRLF line terminators, are
 # passed to dos2unix when it is installed; otherwise only a hint is printed.
 if [[ "$file_test" == *"with CRLF line terminators"* || "$file_test" != *"UTF-8"* ]]; then
    if command -v dos2unix >/dev/null 2>&1; then
        dos2unix "$filename"
    else
        echo "Consider installing \"dos2unix\"."
    fi
 fi

 # Work out the extension and the output file name.
 #
 # Globals read:    filename
 # Globals written: base_name, ext (lower-cased extension, "txt" when the file
 #                  has none), output ("<name>_paragraphed.<ext>")
 #
 # The dot is looked for in the last path component only. Testing the whole
 # path would mistake a dot in a directory name (e.g. "./notes" or
 # "my.dir/notes") for an extension and produce a broken output path.
 base_name="${filename##*/}"

 if [[ "$base_name" == *"."* ]]; then
    ext="${base_name##*.}"
    ext="${ext:l}" # zsh modifier: lower-case
    # The last dot of the path is inside the last component here, so removing
    # the shortest ".*" suffix strips exactly the extension.
    output="${filename%.*}_paragraphed.$ext"
 else # filename has no extension
    ext="txt"
    output="${filename}_paragraphed.$ext"
 fi

 # The paragrapher would not read the last line of the input unless it is
 # followed by an empty one, so append a newline whenever the final line has
 # content.
 if [[ -n "$(tail -n 1 "$filename")" ]]; then
    echo "" >> "$filename"
 fi

 # Start from an empty output file: created if missing, truncated otherwise.
 : > "$output"

 # Accumulator for the paragraph that is currently being joined.
 paragraph=''

 #######################################
 # Write the pending paragraph, followed by one blank line, to the output
 # file and reset the accumulator. Does nothing when no paragraph is pending.
 # Globals:   paragraph (read, then cleared), output (read)
 # Arguments: none
 # Outputs:   appends to "$output"
 #######################################
 _dump_paragraph() {
    if [[ -n "$paragraph" ]]; then
        # printf '%s' writes the text verbatim; echo would interpret backslash
        # sequences under zsh and swallow text that looks like an option (-n).
        printf '%s\n\n' "$paragraph" >> "$output"
        paragraph=''
    fi
 }

 # Main pass: read "$filename" line by line and join broken paragraphs.
 #
 # Globals read:    filename, columns, ext, output
 # Globals written: line, size, last_char, paragraph
 #
 # A line of at least $columns characters is treated as part of a wrapped
 # paragraph and collected in "paragraph"; shorter lines are copied through.
 # A collected paragraph is written out once it ends with "." or ":".
 #
 # https://mywiki.wooledge.org/BashFAQ/001#Trimming
 # "read" runs with the default IFS here (not "IFS= read -r line"), so leading
 # and trailing whitespace is trimmed from every line.
 while read -r line; do
    size=${#line} # length of the current line

    if [[ -z "$paragraph" ]]; then
        # Nothing is being collected: a long line starts a paragraph, a short
        # one (size < columns) is copied to the output as it is.
        if [[ $size -ge $columns ]]; then
            paragraph="$line"
        else
            # printf writes the text verbatim (echo would interpret
            # backslash sequences under zsh).
            printf '%s\n' "$line" >> "$output"
        fi
    else
        # A paragraph is being collected; check how it currently ends.
        last_char="${paragraph: -1}"

        if [[ "$last_char" != "." && "$last_char" != ":" ]]; then
            # Not finished yet: normally the line is appended to it.
            if [[ "$ext" = "md" ]]; then
                # In Markdown a heading or a list item closes the paragraph
                # even without final punctuation.
                case "${line:0:7}" in
                    \#\ * | \
                    \#\#\ * | \
                    \#\#\#\ * | \
                    \#\#\#\#\ * | \
                    \#\#\#\#\#\ * | \
                    \#\#\#\#\#\#\ * | \
                    "•"[$'\t ']* | \
                    "-"[$'\t ']* )
                        _dump_paragraph

                        if [[ "${line:0:1}" = "#" ]]; then
                            # Heading: written at once, followed by a blank line.
                            printf '%s\n\n' "$line" >> "$output"
                        else
                            # List item: becomes the start of a new paragraph.
                            paragraph="$line"
                        fi
                        ;;
                    * )
                        paragraph+=" $line"
                        ;;
                esac
            else
                paragraph+=" $line"
            fi
        else
            # The paragraph ends with "." or ":": write it out, then handle
            # the current line as if nothing were being collected, except
            # that empty lines are dropped (the dump already added one).
            _dump_paragraph

            if [[ $size -ge $columns ]]; then
                paragraph="$line"
            elif [[ $size -ne 0 ]]; then
                printf '%s\n' "$line" >> "$output"
            fi
        fi
    fi
 done < "$filename"

 # A paragraph that does not end with "." or ":" (e.g. it ends with "?" or
 # "!") is still pending when the input runs out; write it instead of
 # silently losing it.
 _dump_paragraph

 # Clean output file. All substitutions run in a single sed invocation and
 # are applied to each line in the order listed.
 cleanup_rules=(
    -e "s/’|‘/'/g"
    -e "s/“|”/\"/g"
    -e s/$'\u000c'//g              # Form feed (U+000C)
    -e "s/ -- / – /g"              # En Dash (U+2013)
    -e '/ --[[:digit:]]/ s/--/–/g' # En Dash (U+2013) representing a minus sign
 )

 if [[ "$ext" = "md" ]]; then
    cleanup_rules+=(-e 's/^[•-] [ \t]*/  - /g') # Bullet (U+2022) or list
 fi

 sed -ri "${cleanup_rules[@]}" "$output"

 # The author likes files to end with an empty line.
 echo "" >> "$output"
	#!/bin/zsh
	################################################################################
	# PARAGRAPHER revision 20
	# By Daniel "Nerun" Rodrigues
	# May 17, 2025
	# https://gist.github.com/nerun/8318924aa35f3f27231f86468804cc8c
	#
	# The purpose of this script is to analyze plain text files (with or without the
	# ".txt" extension) looking for broken paragraphs, i.e., paragraphs split
	# across more than one line, and join each one into a single very long line.
	#
	# This program is free software; you can redistribute it and/or modify it under
	# the terms of the Creative Commons Zero 1.0 Universal (CC0 1.0) Public Domain
	# Dedication (https://creativecommons.org/publicdomain/zero/1.0/).
	################################################################################
	if [[ -z "$*" ]] \|\| [[ "$1" = "-h" ]] \|\| [[ "$1" = "--help" ]]; then
	cat << EOF
	Paragrapher processes plain text files to fix broken paragraphs — lines split
	where they shouldn’t be — and joins them into single, long lines.

	Usage: paragrapher [OPTION] <FILE>

	-c, --columns Set minimum line length to detect paragraphs. Default is 72
	columns. Use lower values (50 or 60) for shorter paragraphs.
	Example: paragrapher -c 60 filename

	-h, --help Show this help message.

	-v, --version Show version, author, URL, and license info.

	The output is always saved as filename_paragraphed.

	Works well with Markdown — recognizes headings and lists. Run it directly on
	".md" files: paragrapher filename.md.
	EOF
	exit 0
	elif [[ "$1" = "-v" \|\| "$1" = "--version" ]]; then
	cat << EOF
	PARAGRAPHER revision 20 (May 17th, 2025)
	By Daniel "Nerun" Rodrigues
	https://gist.github.com/nerun/8318924aa35f3f27231f86468804cc8c

	This program is free software; you can redistribute it and/or modify it under
	the terms of the Creative Commons Zero 1.0 Universal (CC0 1.0) Public Domain
	Dedication (https://creativecommons.org/publicdomain/zero/1.0/).
	EOF
	exit 0
	elif [[ "$1" = "-c" \|\| "$1" = "--columns" ]]; then
	if [[ -n "$2" && "$2" == <-> && -n "$3" ]]; then
	columns=$2
	filename="$3"
	else
	echo "Error: incorrect usage. Example: paragrapher -c 72 file.txt"
	exit 1
	fi
	else
	columns=72
	filename="$1"
	fi

	# Input validation. Every failed check prints a message and exits with
	# status 1.
	#
	# Globals read:    filename
	# Globals written: file_test (description returned by file(1))
	if [[ -z "$filename" ]]; then
	    echo "Error: no input file specified."
	    exit 1
	fi

	# Check if file exists
	if [[ ! -f "$filename" ]]; then
	    echo "Error: there's no file \"$filename\"."
	    exit 1
	else
	    # Check if file is a plain text file
	    file_test=$(file -b "$filename")
	    echo "$file_test"

	    # The description only has to CONTAIN "text", hence the surrounding
	    # "*" globs; comparing against the bare word would reject every file.
	    if [[ "$file_test" != *"text"* ]]; then
	        echo "Error: \"$filename\" is not a plain text file."
	        echo "(detected: \"$file_test\")."
	        exit 1
	    else
	        # Inputs not reported as UTF-8, or reported with CRLF line
	        # terminators, are passed to dos2unix when it is installed.
	        if [[ "$file_test" != *"UTF-8"* || "$file_test" == *"with CRLF line terminators"* ]]; then
	            if command -v dos2unix >/dev/null 2>&1; then
	                dos2unix "$filename"
	            else
	                echo "Consider installing \"dos2unix\"."
	            fi
	        fi
	    fi
	fi

	# Work out the extension and the output file name.
	#
	# Globals read:    filename
	# Globals written: base_name, ext (lower-cased extension, "txt" when the file
	#                  has none), output ("<name>_paragraphed.<ext>")
	#
	# The dot is looked for anywhere in the last path component (glob *"."*).
	# Testing the whole path would mistake a dot in a directory name (e.g.
	# "./notes") for an extension and produce a broken output path.
	base_name="${filename##*/}"

	if [[ "$base_name" == *"."* ]]; then
	    ext="${base_name##*.}"
	    ext="${ext:l}" # zsh modifier: lower-case
	    # The last dot of the path is inside the last component here, so removing
	    # the shortest ".*" suffix strips exactly the extension.
	    output="${filename%.*}_paragraphed.$ext"
	else # filename has no extension
	    ext="txt"
	    output="${filename}_paragraphed.$ext"
	fi

	# The paragrapher would not read the last line of the input unless it is
	# followed by an empty one, so append a newline whenever the final line has
	# content.
	if [[ -n "$(tail -n 1 "$filename")" ]]; then
	    echo "" >> "$filename"
	fi

	# Start from an empty output file: created if missing, truncated otherwise.
	: > "$output"

	# Accumulator for the paragraph that is currently being joined.
	paragraph=''

	#######################################
	# Write the pending paragraph, followed by one blank line, to the output
	# file and reset the accumulator. Does nothing when no paragraph is pending.
	# Globals:   paragraph (read, then cleared), output (read)
	# Arguments: none
	# Outputs:   appends to "$output"
	#######################################
	_dump_paragraph() {
	    if [[ -n "$paragraph" ]]; then
	        # printf '%s' writes the text verbatim; echo would interpret backslash
	        # sequences under zsh and swallow text that looks like an option (-n).
	        printf '%s\n\n' "$paragraph" >> "$output"
	        paragraph=''
	    fi
	}

	# Main pass: read "$filename" line by line and join broken paragraphs.
	#
	# Globals read:    filename, columns, ext, output
	# Globals written: line, size, last_char, paragraph
	#
	# A line of at least $columns characters is treated as part of a wrapped
	# paragraph and collected in "paragraph"; shorter lines are copied through.
	# A collected paragraph is written out once it ends with "." or ":".
	#
	# https://mywiki.wooledge.org/BashFAQ/001#Trimming
	# "read" runs with the default IFS here (not "IFS= read -r line"), so leading
	# and trailing whitespace is trimmed from every line.
	while read -r line; do
	    size=${#line} # length of the current line

	    if [[ -z "$paragraph" ]]; then
	        # Nothing is being collected: a long line starts a paragraph, a short
	        # one (size < columns) is copied to the output as it is.
	        if [[ $size -ge $columns ]]; then
	            paragraph="$line"
	        else
	            # printf writes the text verbatim (echo would interpret
	            # backslash sequences under zsh).
	            printf '%s\n' "$line" >> "$output"
	        fi
	    else
	        # A paragraph is being collected; check how it currently ends.
	        last_char="${paragraph: -1}"

	        if [[ "$last_char" != "." && "$last_char" != ":" ]]; then
	            # Not finished yet: normally the line is appended to it.
	            if [[ "$ext" = "md" ]]; then
	                # In Markdown a heading or a list item closes the paragraph
	                # even without final punctuation.
	                case "${line:0:7}" in
	                    \#\ * | \
	                    \#\#\ * | \
	                    \#\#\#\ * | \
	                    \#\#\#\#\ * | \
	                    \#\#\#\#\#\ * | \
	                    \#\#\#\#\#\#\ * | \
	                    "•"[$'\t ']* | \
	                    "-"[$'\t ']* )
	                        _dump_paragraph

	                        if [[ "${line:0:1}" = "#" ]]; then
	                            # Heading: written at once, followed by a blank line.
	                            printf '%s\n\n' "$line" >> "$output"
	                        else
	                            # List item: becomes the start of a new paragraph.
	                            paragraph="$line"
	                        fi
	                        ;;
	                    * )
	                        paragraph+=" $line"
	                        ;;
	                esac
	            else
	                paragraph+=" $line"
	            fi
	        else
	            # The paragraph ends with "." or ":": write it out, then handle
	            # the current line as if nothing were being collected, except
	            # that empty lines are dropped (the dump already added one).
	            _dump_paragraph

	            if [[ $size -ge $columns ]]; then
	                paragraph="$line"
	            elif [[ $size -ne 0 ]]; then
	                printf '%s\n' "$line" >> "$output"
	            fi
	        fi
	    fi
	done < "$filename"

	# A paragraph that does not end with "." or ":" (e.g. it ends with "?" or
	# "!") is still pending when the input runs out; write it instead of
	# silently losing it.
	_dump_paragraph

	# Clean output file: normalise typographic characters in "$output" in place.
	#
	# Globals read: output, ext
	#
	# The first two commands use extended regular expressions (-r), where the
	# alternation operator is a bare "|"; "\|" would match a literal pipe there.
	sed -ri "s/’|‘/'/g" "$output"   # curly single quotes -> '
	sed -ri "s/“|”/\"/g" "$output"  # curly double quotes -> "
	sed -i s/$'\u000c'//g "$output" # Form feed (U+000C)
	sed -i "s/ -- / – /g" "$output" # En Dash (U+2013)
	sed -i '/ --[[:digit:]]/ s/--/–/g' "$output" # En Dash (U+2013) representing a minus sign
	if [[ "$ext" = "md" ]]; then
	    # Bullet (U+2022) or dash list marker -> "  - " (two-space indent)
	    sed -i 's/^[•-] [ \t]*/  - /g' "$output"
	fi
	echo "" >> "$output" # the author likes files to end with an empty line