Last active
April 16, 2024 01:25
-
-
Save jaggzh/68ade8171e8d7983f63d3017762699e1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
# gist-paste -u https://gist.github.com/jaggzh/68ade8171e8d7983f63d3017762699e1 rewrap-paragraphs | |
use v5.36; | |
use strict; | |
use warnings; | |
use Getopt::Long; | |
use Lingua::EN::Sentence qw(get_sentences add_acronyms); | |
use utf8; | |
use bansi; | |
# Decode/encode all three standard streams as UTF-8 text.
for my $stream (*STDIN, *STDOUT, *STDERR) {
    binmode($stream, ":encoding(utf-8)");
}

# Option defaults; each may be overridden on the command line.
my ($max_word_count, $char_limit, $verbosity) = (256, 1000, 0);
my ($opt_help, $opt_join);

# Non-option arguments are collected here and treated as input file names.
my @files;

# Unbuffer the currently selected output handle.
$| = 1;
# Print the command-line help text to standard output.
# The current values of $max_word_count and $char_limit are interpolated
# into the text so the user sees the effective defaults.
sub usage {
    my $help_text = <<~"EOT";
        Unwraps LFLF separated wrapped-paragraphs
        Options:
        -w # / --word-count # Max word count ($max_word_count)
        -c # / --char-limit # Max char count ($char_limit)
        -j # / --join-all Join all lines first (don't keep nl nl paragraph bounds)
        -v / --verbose Increment verbosity
        -h / --help Me!
        EOT
    print $help_text;
}
# Parse the command line.  Non-option arguments are routed through the
# '<>' callback and collected in @files as input file names.
# GetOptions returns false when it encounters an invalid option (it has
# already written a diagnostic to STDERR by then); in that case show the
# help text and stop with a non-zero status instead of silently carrying on.
GetOptions(
    'w|word-count=i' => \$max_word_count,
    'c|char-limit=i' => \$char_limit,
    'v|verbose+'     => \$verbosity,    # Incremental verbosity
    'h|help'         => \$opt_help,
    'j|join-all'     => \$opt_join,
    '<>'             => sub { push @files, $_[0]; },
) or do {
    usage();
    exit 2;
};

# Explicit request for help: print it and exit successfully.
if ($opt_help) {
    usage();
    exit;
}
# Collect every input line (line terminators included).
# When @files is empty the lines come from STDIN; otherwise each named file
# is opened as UTF-8 and read in full, in the order given.
# Dies if a file cannot be opened.
# Returns the list of lines.
sub read_input {
    my @lines;

    unless (@files) {
        @lines = <STDIN>;
        return @lines;
    }

    for my $path (@files) {
        open my $in, '<:encoding(utf-8)', $path or die "Can't open $path: $!";
        push @lines, readline($in);
        close $in;
    }
    return @lines;
}
# Decide whether a piece of text ends like a finished sentence.
# Returns 1 when the text ends in '?', '!' or ':', or in a '.' that is not
# the tail of a dotted abbreviation such as "e.g."; returns 0 otherwise.
sub has_sent_end {
    my ($text) = @_;

    return 1 if $text =~ /[?!:]$/;

    # Anything else must at least end with a period.
    return 0 unless $text =~ /\.$/;

    # ".<word char>." at the end looks like an abbreviation ("e.g."), which
    # is not really the end of a sentence.
    return $text =~ /\.\w\.$/ ? 0 : 1;
}
# Heuristic: text that begins with a lowercase ASCII letter, optionally
# preceded by a single hyphen, is probably the continuation of a sentence
# rather than the start of a new one.
# Returns the result of the pattern match (true/false).
sub seems_not_sent_start {
    my $candidate = shift;
    return $candidate =~ m/^-?[a-z]/;
}
# Merge wrapped lines into one string per paragraph.
#
# Input:  a list of lines; a line that is empty or all whitespace acts as a
#         paragraph separator.
# Output: a list of paragraphs, each trimmed of leading/trailing whitespace.
#
# Consecutive non-blank lines are joined with a single space.  A blank line
# normally closes the current paragraph, except when the text so far does
# not look finished (has_sent_end is false) and the line right after the
# blank one looks like a continuation (seems_not_sent_start is true); that
# line is then pulled into the current paragraph.
sub join_appropriate_lines {
    my @ls = @_;
    my @newls;
    my $current_line = "";

    # Index loop on purpose: the continuation case consumes an extra line.
    for (my $linei = 0; $linei <= $#ls; $linei++) {
        my $line = $ls[$linei];
        debug_print(5, "[$linei] Processing line: $line");

        if ($line !~ /^\s*$/) {
            $current_line .= ' ' . $line;
            next;
        }

        # Blank line with nothing accumulated: nothing to close.
        next if $current_line eq "";

        if (!has_sent_end($current_line) && $linei < $#ls && seems_not_sent_start($ls[$linei+1])) {
            my $continuation = $ls[$linei+1];

            # Likely a word hyphenated across the break.  Remove only the
            # hyphen itself; the lookahead keeps the letter that follows it.
            my $hyphen_removed = 0;
            $hyphen_removed = 1 if $current_line =~ s/-$//;
            $hyphen_removed = 1 if $continuation =~ s/^-(?=[a-z])//;

            # The two halves of a hyphenated word are glued back together
            # directly; otherwise the lines are separated by one space.
            $current_line .= ($hyphen_removed ? '' : ' ') . $continuation;
            $linei++;    # the continuation line has been consumed
        }
        else {
            push(@newls, $current_line);
            debug_print(5, "New paragraph: $current_line");
            $current_line = "";    # Reset for the next set of lines
        }
    }

    push(@newls, $current_line) if $current_line ne "";    # Add the last accumulated line

    # Trim leading and trailing whitespace of every paragraph in place.
    s/^\s+|\s+$//g for @newls;
    return @newls;
}
# Write a diagnostic line to STDERR when the global $verbosity is at least
# $level.
#   $level         - minimum verbosity required for the message to appear
#   @message_parts - one or more pieces of the message; they are
#                    concatenated without a separator, so callers may pass
#                    either a single string or several fragments
sub debug_print {
    my ($level, @message_parts) = @_;
    return if $verbosity < $level;
    say STDERR join('', @message_parts);
}
# Count the whitespace-separated words in $text.
# Returns 0 for an undefined, empty or all-whitespace string.
sub word_count {
    my ($text) = @_;
    return 0 unless defined $text;

    # split ' ' skips leading whitespace, so text that starts with a space
    # does not produce a phantom empty first word.
    my @words = split ' ', $text;
    return scalar @words;
}
# Try to break $sentence in two right after a punctuation mark.
#
# The marks are tried in priority order (';' first, '-' last).  For each
# mark only its first occurrence is considered, and it is used only when
# the first part (mark included) is shorter than $max_chars.
#
# Returns ($first_part, $remaining) on success; $remaining has its leading
# whitespace removed and is '' when the mark ends the sentence.
# Returns an empty list (undef in scalar context) when no usable mark exists.
sub split_at_punctuation {
    my ($sentence, $max_chars) = @_;
    my @punctuations = (';', ',', ')', '—', '–', '-');    # Include UTF-8 dashes

    foreach my $punc (@punctuations) {
        my $found_at = index($sentence, $punc);
        next if $found_at < 0;

        my $head_length = $found_at + length($punc);
        next unless $head_length < $max_chars;

        my $first_part = substr($sentence, 0, $head_length);

        # Keep everything after the mark and strip only whitespace; blindly
        # skipping one character would eat a letter in "well-known".
        my $remaining = substr($sentence, $head_length);
        $remaining =~ s/^\s+//;

        return ($first_part, $remaining);
    }
    return;
}
# Re-wrap one paragraph into output lines built from whole sentences.
#
#   $text      - the paragraph to wrap
#   $max_words - maximum number of words per output line
#   $max_chars - maximum number of characters per output line
#
# Sentences (as returned by get_sentences) are appended to the current line
# while both limits hold.  A sentence that does not fit starts a new line.
# A sentence that is by itself over a limit is split once: preferably after
# a punctuation mark (split_at_punctuation), otherwise at the last space
# within $max_chars, otherwise hard at $max_chars.  The remainder of such a
# split becomes the new current line as-is.
#
# Returns the list of wrapped lines.
# NOTE(review): the colour variables ($yel, $rst, ...) are presumably
# exported by the bansi module - confirm.
sub wrap_text {
    my ($text, $max_words, $max_chars) = @_;
    my @wrapped_lines;
    # Fall back to an empty list in case get_sentences yields undef.
    my $sentences = get_sentences($text) // [];
    my $current_line = '';
    my $current_word_count = 0;
    debug_print(2, "$bgblu$yel Beginning text wrapping. $rst");
    debug_print(3, " {{$yel$text$rst}}");

    foreach my $sentence (@$sentences) {
        my $sentence_word_count = word_count($sentence);

        # Built as a single string so the whole message is printed.
        debug_print(3, " $whi$sentence_word_count <= $max_words && " . length($sentence) . " <= $max_chars$rst");

        # What the line would look like with this sentence appended; no
        # separator is needed (or counted) when the line is still empty.
        my $candidate = $current_line eq '' ? $sentence : $current_line . ' ' . $sentence;

        if ($current_word_count + $sentence_word_count <= $max_words && length($candidate) <= $max_chars) {
            debug_print(3, " ${bbla}Appending sentence within limits: ${rst}$sentence");
            $current_line = $candidate;
            $current_word_count += $sentence_word_count;
            next;
        }

        # The sentence does not fit: flush what has been collected so far.
        # Compared against '' so that a line consisting of "0" is kept.
        if ($current_line ne '') {
            debug_print(3, " ${gre}FINAL Pushing current line: ${rst}$current_line");
            push @wrapped_lines, $current_line;
        }

        if ($sentence_word_count <= $max_words && length($sentence) <= $max_chars) {
            # Fits on a line of its own.
            $current_line = $sentence;
            $current_word_count = $sentence_word_count;
            next;
        }

        debug_print(3, " ${bbla}Sentence exceeds limits, attempting to split.$rst");
        my ($first_part, $remaining) = split_at_punctuation($sentence, $max_chars);
        if (defined $first_part) {
            debug_print(3, " ${gre}FINAL Split at punctuation: ${rst}$first_part");
            push @wrapped_lines, $first_part;
            $current_line = $remaining // '';
            $current_word_count = word_count($current_line);
        }
        else {
            # Directly split long sentences exceeding both limits
            my $cut = substr($sentence, 0, $max_chars);
            my $breakpoint = rindex($cut, ' ');
            if ($breakpoint != -1) {
                # Break at the last space inside the limit; drop that space.
                $current_line = substr($sentence, 0, $breakpoint);
                $sentence = substr($sentence, $breakpoint + 1);
            }
            else {
                # No space available: cut hard at the character limit.
                $current_line = substr($sentence, 0, $max_chars);
                $sentence = substr($sentence, $max_chars);
            }
            debug_print(3, " ${gre}FINAL Forced split: ${rst}$current_line");
            push @wrapped_lines, $current_line;
            $current_line = $sentence;
            $current_word_count = word_count($current_line);
        }
    }

    push @wrapped_lines, $current_line if $current_line ne '';
    debug_print(3, " ${gre}Final wrapped lines complete.$rst");
    for my $t (@wrapped_lines) {
        debug_print(3, " {{${bgre}$t$rst}}");
    }
    return @wrapped_lines;
}
# Main driver: read the input, rebuild paragraphs, re-wrap each one and
# print the resulting lines to standard output, one per line.
# With --join-all ($opt_join) all paragraphs are first merged into a single
# paragraph separated by spaces.
sub process_input {
    my @lines = read_input();

    # Strip surrounding whitespace (including the line terminator) in place.
    for my $raw (@lines) {
        $raw =~ s/^\s+//;
        $raw =~ s/\s+$//;
    }

    my @paragraphs = join_appropriate_lines(@lines);
    @paragraphs = (join(' ', @paragraphs)) if $opt_join;

    for my $paragraph (@paragraphs) {
        next if $paragraph =~ /^\s*$/;
        $paragraph =~ s/^\s+//s;
        $paragraph =~ s/\s+$//s;
        print "$_\n" for wrap_text($paragraph, $max_word_count, $char_limit);
        # No blank line is emitted between paragraphs.
    }
}

# Script entry point.
process_input();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment