Last active
June 20, 2024 02:28
-
-
Save nerun/8318924aa35f3f27231f86468804cc8c to your computer and use it in GitHub Desktop.
The purpose of this script is to analyze plain text files (with or without the ".txt" extension) looking for broken paragraphs, i.e., paragraphs split across more than one line, and join each of them into a single very long line. It's Markdown-friendly.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env zsh | |
################################################################################ | |
# PARAGRAPHER revision 19 | |
# By Daniel "Nerun" Rodrigues | |
# June 19, 2024 | |
# https://gist.github.com/nerun/8318924aa35f3f27231f86468804cc8c | |
# | |
# The purpose of this script is to analyze plain text files (with or without the
# ".txt" extension) looking for broken paragraphs, i.e., paragraphs split across
# more than one line, and join each of them into a single very long line.
# | |
# This program is free software; you can redistribute it and/or modify it under | |
# the terms of the Creative Commons Zero 1.0 Universal (CC0 1.0) Public Domain | |
# Dedication (https://creativecommons.org/publicdomain/zero/1.0/). | |
################################################################################ | |
# Parse the command line. On success two globals are set for the rest of the
# script:
#   columns  - minimum width a line must have to start a paragraph (default 72)
#   filename - path of the text file to process
# With no arguments, "-h" or "--help" the help text is printed; with "-v" or
# "--version" the version text is printed. Both cases exit with status 0.
# An invalid "-c" value or a missing file name exits with status 1.
if [[ -z "$*" || "$1" = "-h" || "$1" = "--help" ]]; then
  # Quoted delimiter: the text is printed literally, no expansion takes place.
  cat <<'EOF'
The purpose of PARAGRAPHER is to analyze plain text files (with or without the
".txt" extension) looking for broken paragraphs, i.e., paragraphs splited in
more than one line, and join them in a single very long line (a.k.a. paragraph).
Usage: paragrapher [OPTION] <FILE>
-c, --columns Sets the width of a paragraph. The program will look for all
lines with a width greater than this number and consider
them to be a paragraph. The default is 72 columns: ideal for
paragraphs with more than 72 columns, such as 80 or 100. If
your text has smaller paragraphs, try reducing this value to
50 or 60. Ex.: "paragrapher -c 60 filename".
-h, --help Show this help.
-v, --version Show version, author, URL and license.
The output file will always be 'filename_paragraphed'.
Paragrapher is markdown friendly, and can recognize headings and unordered
lists. You can use it directly over a markdown file: "paragrapher filename.md".

EOF
  exit 0
elif [[ "$1" = "-v" || "$1" = "--version" ]]; then
  # Revision and date match the header comment at the top of this file.
  cat <<'EOF'
PARAGRAPHER revision 19 (June 19, 2024)
By Daniel "Nerun" Rodrigues
https://gist.github.com/nerun/8318924aa35f3f27231f86468804cc8c
This program is free software; you can redistribute it and/or modify it under
the terms of the Creative Commons Zero 1.0 Universal (CC0 1.0) Public Domain
Dedication (https://creativecommons.org/publicdomain/zero/1.0/).

EOF
  exit 0
elif [[ "$1" = "-c" || "$1" = "--columns" ]]; then
  columns=$2
  # The value is later used in numeric tests ("-ge"); anything that is not a
  # plain run of digits would make every one of those tests fail.
  case "$columns" in
    ''|*[!0-9]*)
      printf '%s\n' "Error: $1 requires a whole number, got \"$columns\"." >&2
      exit 1
      ;;
  esac
  # Everything after the number is the file name, joined with spaces exactly
  # like in the default branch below.
  shift 2
  filename="$*"
  if [[ -z "$filename" ]]; then
    printf '%s\n' "Error: missing file name. Usage: paragrapher [OPTION] <FILE>" >&2
    exit 1
  fi
else
  columns=72
  # All arguments are joined, so a file name with spaces works unquoted.
  filename="$*"
fi
# Validate the input file before anything is written.
# Reads:  filename
# Sets:   file_test - the description printed by "file" for the input
# Exits with status 1 when the file does not exist or is not reported as text.
# When the file is not reported as UTF-8, or has CRLF line terminators, it is
# converted in place with dos2unix if that tool is installed.

# Check if file exists
if [[ ! -f "$filename" ]]; then
  printf '%s\n' "Error: there's no file \"$filename\"." >&2
  exit 1
fi

# Check if file is a plain text file.
# "-b" leaves the file name out of the description: otherwise a name that
# merely contains "text" or "UTF-8" would satisfy the pattern tests below.
file_test=$(file -b -- "$filename")
printf '%s: %s\n' "$filename" "$file_test"
if [[ "$file_test" != *"text"* ]]; then
  printf '%s\n' "Error: \"$filename\" is not a plain text file." >&2
  exit 1
fi

# Check if it is UTF-8
if [[ "$file_test" != *"UTF-8"* || "$file_test" == *"with CRLF line terminators"* ]]; then
  if command -v dos2unix > /dev/null 2>&1; then
    dos2unix -- "$filename"
  else
    echo "Consider installing \"dos2unix\"."
  fi
fi
# Verify filename extension, if any, and derive the output file name.
# Reads:  filename
# Sets:   ext    - lower-cased extension of the input, "txt" when it has none
#         output - "<name>_paragraphed.<ext>", in the same directory as the input
_set_output_name(){
  # Only the last path component may carry an extension; a dot in a directory
  # name ("./notes", "my.dir/file") must not be mistaken for one.
  local base="${filename##*/}"
  if [[ "$base" == *"."* ]]; then
    ext=$(printf '%s\n' "${base##*.}" | sed 's/.*/\L&/')
    output="${filename%.*}_paragraphed.$ext"
  else # filename has no extension
    ext="txt"
    output="${filename}_paragraphed.$ext"
  fi
}
_set_output_name
# Add an empty line to the end of the file to avoid paragrapher not reading the
# last line.
# Arguments: $1 - path of the file to fix up (modified in place)
# Returns:   1 when $1 is not a regular file, 0 otherwise
_ensure_blank_tail(){
  local last_line
  [[ -f "$1" ]] || return 1
  # Command substitution drops trailing newlines, so this is empty exactly
  # when the file already ends with an empty line (or is empty).
  last_line=$(tail -n 1 -- "$1")
  if [[ -n "$last_line" ]]; then
    printf '\n' >> "$1"
  fi
  return 0
}
_ensure_blank_tail "$filename"
# Create or clean the output file. The target is quoted so that a path with
# spaces is still a single redirection target when run under bash.
: > "$output"
# Text of the paragraph currently being assembled; empty means "none".
paragraph=''
# get string length
# Arguments: $1 - the string to measure (may be empty or missing)
# Outputs:   the number of characters in $1 on stdout, 0 for an empty string
_strSize(){
  # Parameter expansion instead of "expr length": no external process, and no
  # failure when the string itself is an expr keyword or operator such as
  # "length", "match" or "(".
  printf '%s\n' "${#1}"
}
# Append the paragraph under construction to the output file, followed by one
# empty line, then reset it.
# Globals: paragraph (read, then cleared), output (path appended to)
_dump_paragraph(){
  # printf rather than echo: the text is written verbatim, whereas echo may
  # interpret backslash sequences or a leading "-n" depending on the shell.
  printf '%s\n\n' "$paragraph" >> "$output"
  paragraph=''
}
# Read "$filename" line by line, join broken paragraphs and append the result
# to "$output".
#
# A line at least $columns characters wide starts a paragraph. Following lines
# are glued to it (separated by one space) until the paragraph ends with "." or
# ":"; the next line read after that writes the paragraph out. In markdown
# files (ext = "md") a heading or unordered-list line also ends the paragraph.
# Shorter lines outside a paragraph are copied through unchanged.
#
# Globals: filename, output, columns, ext (read); paragraph (read and written)
# Calls:   _strSize, _dump_paragraph
_join_paragraphs(){
  local line size
  # https://mywiki.wooledge.org/BashFAQ/001#Trimming
  # No "IFS=" here: read strips leading and trailing whitespace from each line.
  # The "|| [[ -n ... ]]" part keeps a last line that has no trailing newline.
  while read -r line || [[ -n "$line" ]]; do
    size=$(_strSize "$line")
    if [[ -z "$paragraph" ]]; then
      if [ "$size" -ge "$columns" ]; then
        paragraph="$line"
      else # shorter than $columns: not part of a paragraph
        printf '%s\n' "$line" >> "$output"
      fi
    elif [[ "${paragraph: -1}" =~ [\.:] ]]; then
      # The pending paragraph is complete: write it, then handle this line.
      _dump_paragraph
      if [ "$size" -ge "$columns" ]; then
        paragraph+="$line"
      elif [ "$size" -ne 0 ]; then
        printf '%s\n' "$line" >> "$output"
      fi
    elif [[ "$ext" = "md" ]]; then
      if [[ ! "${line: 0:2}" =~ (#[# ]|•[ \t]|-[ \t]) ]]; then
        paragraph+=" $line"
      else # starts with # (it's a heading) or •/- (unordered list)
        _dump_paragraph
        if [[ "${line: 0:1}" = "#" ]]; then
          printf '%s\n\n' "$line" >> "$output"
        else # unordered list (• or -)
          paragraph="$line"
        fi
      fi
    else
      paragraph+=" $line"
    fi
  done < "$filename"
  # A paragraph still pending at end of file (for example one ending in "?" or
  # "!") would otherwise never be written.
  if [[ -n "$paragraph" ]]; then
    _dump_paragraph
  fi
}
_join_paragraphs
# Clean output file: normalise typographic characters, squeeze runs of empty
# lines into one and end the file with an empty line.
# Globals: output (file rewritten in place), ext (read)
# Returns: 1 when "$output" is not a regular file, 0 otherwise
_clean_output(){
  local squeezed
  local -a sed_script
  [[ -f "$output" ]] || return 1
  # One sed pass instead of one per rule; every rule works on a single line,
  # so applying them in this order per line gives the same result.
  sed_script=(
    -e "s/’|‘/'/g"                  # curly single quotes -> apostrophe
    -e 's/“|”/"/g'                  # curly double quotes -> straight quote
    -e $'s/\f//g'                   # Form feed (U+000C)
    -e 's/ -- / – /g'               # En Dash (U+2013)
    -e '/ --[[:digit:]]/ s/--/–/g'  # En Dash (U+2013) representing a minus sign
  )
  if [[ "$ext" = "md" ]]; then
    sed_script+=(-e 's/^[•-] [ \t]*/ - /g') # Bullet (U+2022) or list
  fi
  sed -E -i "${sed_script[@]}" -- "$output"
  # "command" bypasses aliases and functions without hard-coding /usr/bin.
  squeezed=$(command cat -s -- "$output")
  # printf writes the text verbatim (echo may interpret backslash sequences);
  # the second newline is the empty line the file is meant to end with.
  printf '%s\n\n' "$squeezed" > "$output"
  return 0
}
_clean_output
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It is fully compatible with BASH 5+. Just change the shebang to
#!/usr/bin/env bash