Skip to content

Instantly share code, notes, and snippets.

@jaggzh
Last active April 16, 2024 01:25
Show Gist options
  • Save jaggzh/68ade8171e8d7983f63d3017762699e1 to your computer and use it in GitHub Desktop.
Save jaggzh/68ade8171e8d7983f63d3017762699e1 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl
# gist-paste -u https://gist.github.com/jaggzh/68ade8171e8d7983f63d3017762699e1 rewrap-paragraphs
use v5.36;
use strict;
use warnings;
use Getopt::Long;
use Lingua::EN::Sentence qw(get_sentences add_acronyms);
use utf8;
use bansi;
# Decode input and encode output/diagnostics as UTF-8.
binmode(*STDIN, ":encoding(utf-8)");
binmode(*STDOUT, ":encoding(utf-8)");
binmode(*STDERR, ":encoding(utf-8)");
# Limits applied to every output line; overridable with -w / -c.
my $max_word_count = 256;
my $char_limit = 1000;
# Diagnostic level; each -v raises it by one (see debug_print).
my $verbosity = 0;
# Set by -h / --help.
my $opt_help;
# Set by -j / --join-all.
my $opt_join;
# Non-option command-line arguments, treated as input file names.
my @files;
# Turn on autoflush for the currently selected output handle.
$| = 1;
# Print the command-line help text to STDOUT.
#
# $max_word_count and $char_limit are interpolated when the sub is called,
# so the values shown are the ones in effect at that moment.
# Returns whatever print returns.
sub usage {
    # The indented heredoc (<<~) strips the common leading whitespace, so the
    # text is emitted flush-left.
    print <<~"EOT";
    Unwraps LFLF separated wrapped-paragraphs
    Options:
    -w # / --word-count # Max word count ($max_word_count)
    -c # / --char-limit # Max char count ($char_limit)
    -j / --join-all Join all lines first (don't keep nl nl paragraph bounds)
    -v / --verbose Increment verbosity
    -h / --help Me!
    EOT
}
# Parse the command line. Non-option arguments are collected in @files via
# the '<>' handler. GetOptions returns false after warning about an invalid
# option or value; in that case show the help text and stop instead of
# continuing with settings the user did not ask for.
GetOptions(
    'w|word-count=i' => \$max_word_count,
    'c|char-limit=i' => \$char_limit,
    'v|verbose+'     => \$verbosity, # Incremental verbosity
    'h|help'         => \$opt_help,
    'j|join-all'     => \$opt_join,
    '<>'             => sub { push @files, $_[0]; },
) or do {
    usage();
    exit 2;
};
if ($opt_help) {
    usage();
    exit;
}
# Collect every input line.
#
# When file names were given on the command line (@files), each file is read
# in order as UTF-8 and its lines are concatenated; otherwise all of STDIN is
# read. Lines are returned unmodified (line endings included).
# Dies if a named file cannot be opened.
sub read_input {
    my @lines;

    # No files named: fall back to standard input.
    unless (@files) {
        @lines = <STDIN>;
        return @lines;
    }

    for my $file (@files) {
        open my $fh, '<:encoding(utf-8)', $file or die "Can't open $file: $!";
        push @lines, <$fh>;
        close $fh;
    }
    return @lines;
}
# Heuristic: does the given text end like a finished sentence?
#
# Returns 1 when the text ends with '?', '!' or ':', or with a period that is
# not the tail of a dotted abbreviation such as "e.g."; returns 0 otherwise.
sub has_sent_end {
    my ($text) = @_;

    return 1 if $text =~ /[?!:]$/;
    return 0 unless $text =~ /\.$/;    # no terminal period at all

    # A ".<char>." tail looks like an abbreviation ("e.g."), which does not
    # really end a sentence.
    return $text =~ /\.\w\.$/ ? 0 : 1;
}
# Heuristic: does the given line look like the continuation of a sentence
# rather than the start of a new one?
#
# True when the line begins with an ASCII lowercase letter, optionally
# preceded by a single hyphen. The raw result of the pattern match is
# returned.
sub seems_not_sent_start {
    my $candidate = shift;
    return $candidate =~ /^-?[a-z]/;
}
# Merge wrapped lines into paragraphs.
#
# Arguments: the input lines, in order.
# Returns:   one string per paragraph, with leading and trailing whitespace
#            removed.
#
# Consecutive non-blank lines are joined with single spaces. A blank line
# normally closes the paragraph, except when the text so far does not end a
# sentence (has_sent_end) and the line after the blank one looks like a
# continuation (seems_not_sent_start); then the blank line is treated as a
# stray break inside the sentence and the paragraph keeps growing.
sub join_appropriate_lines {
    my @ls = @_;
    my @newls;
    my $current_line = "";

    # A C-style loop is used on purpose: the index is advanced manually when
    # the line following a blank line is consumed as a continuation.
    for (my $linei = 0; $linei <= $#ls; $linei++) {
        my $line = $ls[$linei];
        debug_print(5, "[$linei] Processing line: $line");

        if ($line !~ /^\s*$/) {
            $current_line .= ' ' . $line;
            next;
        }

        # Blank line with nothing accumulated: nothing to close.
        next if $current_line eq "";

        if (!has_sent_end($current_line) && $linei < $#ls && seems_not_sent_start($ls[$linei+1])) {
            $current_line =~ s/-$//; # likely hyphenated
            my $continuation = $ls[$linei+1];
            # Drop only the leading hyphen; the lookahead keeps the letter
            # that follows it (it used to be deleted together with the hyphen).
            $continuation =~ s/^-(?=[a-z])//; # likely hyphenated
            $linei++;
            # NOTE(review): the two halves are still joined with a space, even
            # when a hyphen was removed -- confirm that this is intended.
            $current_line .= ' ' . $continuation;
        } else {
            push(@newls, $current_line);
            debug_print(5, "New paragraph: $current_line");
            $current_line = ""; # Reset for the next set of lines
        }
    }
    push(@newls, $current_line) if $current_line ne ""; # Add the last accumulated line

    # Trim leading and trailing whitespace of every paragraph.
    s/^\s+|\s+$//g for @newls;
    return @newls;
}
# Write a diagnostic line to STDERR when $verbosity is at least $level.
#
# Arguments: the level, followed by one or more message fragments. All
# fragments are concatenated, so callers that pass the message as a list
# (e.g. "text", length($x), "more text") no longer lose everything after the
# first fragment.
sub debug_print {
    my ($level, @message_parts) = @_;
    say STDERR join('', @message_parts) if $verbosity >= $level;
    return;
}
# Count the whitespace-separated words in a string.
#
# Returns 0 for an empty or all-whitespace string.
sub word_count {
    my $text = shift;
    # split ' ' (awk mode) ignores leading whitespace; split(/\s+/) would
    # produce an empty first field there and over-count by one.
    my @words = split ' ', $text;
    return scalar @words;
}
# Split a sentence in two just after a punctuation mark.
#
# Arguments: the sentence and the character limit for the first part.
# Returns:   (first_part, remainder) on success, where first_part ends with
#            the punctuation mark and is shorter than $max_chars characters,
#            and remainder is the non-empty rest with leading whitespace
#            removed. Returns an empty list (undef in scalar context) when no
#            suitable split point exists.
#
# Punctuation marks are tried in the order listed. For each mark the LAST
# occurrence that still fits is used, so the first part is as long as the
# limit allows instead of stopping at the very first occurrence.
sub split_at_punctuation {
    my ($sentence, $max_chars) = @_;
    my @punctuations = (';', ',', ')', '—', '–', '-'); # Include UTF-8 dashes

    # The first part must stay below the limit and must leave something over.
    my $limit = $max_chars < length($sentence) ? $max_chars : length($sentence);

    foreach my $punc (@punctuations) {
        my $width = length $punc;
        my $latest_start = $limit - 1 - $width;
        next if $latest_start < 0;

        my $pos = rindex($sentence, $punc, $latest_start);
        next if $pos < 0;

        my $first_part = substr($sentence, 0, $pos + $width);
        # Keep every character after the mark; only whitespace is dropped.
        # (Skipping one character unconditionally lost a letter whenever the
        # mark was not followed by a space, e.g. in "well-known".)
        my $remaining = substr($sentence, $pos + $width);
        $remaining =~ s/^\s+//;
        return ($first_part, $remaining);
    }
    return;
}
# Re-wrap a paragraph into lines that respect a word and a character limit.
#
# Arguments: the paragraph text, the maximum number of words per line and the
#            maximum number of characters per line.
# Returns:   the list of output lines.
#
# The text is split into sentences with get_sentences and whole sentences are
# packed onto a line while both limits hold. A sentence that is too large on
# its own is broken up repeatedly -- preferably after punctuation
# (split_at_punctuation), otherwise at the last space inside the character
# limit, otherwise mid-word -- until the remainder fits.
# Note: these forced pieces are cut by character count only, so a piece may
# still contain more than $max_words words.
sub wrap_text {
    my ($text, $max_words, $max_chars) = @_;
    my @wrapped_lines;
    my $sentences = get_sentences($text) // [];
    my $current_line = '';
    my $current_word_count = 0;

    debug_print(2, "$bgblu$yel Beginning text wrapping. $rst");
    debug_print(3, " {{$yel$text$rst}}");

    SENTENCE:
    foreach my $sentence (@$sentences) {
        my $sentence_word_count = word_count($sentence);
        # Build one message string so that no fragment is lost.
        debug_print(3, " $whi$sentence_word_count <= $max_words && " . length($sentence) . " <= $max_chars$rst");

        # Case 1: the sentence still fits on the current line.
        if ($current_word_count + $sentence_word_count <= $max_words
            && length($current_line . ' ' . $sentence) <= $max_chars) {
            debug_print(3, " ${bbla}Appending sentence within limits: ${rst}$sentence");
            $current_line .= ($current_line ne '' ? ' ' : '') . $sentence;
            $current_word_count += $sentence_word_count;
            next SENTENCE;
        }

        # It does not fit: the current line is complete.
        if ($current_line ne '') {
            debug_print(3, " ${gre}FINAL Pushing current line: ${rst}$current_line");
            push @wrapped_lines, $current_line;
        }

        # Case 2: the sentence fits on a fresh line.
        if ($sentence_word_count <= $max_words && length($sentence) <= $max_chars) {
            $current_line = $sentence;
            $current_word_count = $sentence_word_count;
            next SENTENCE;
        }

        # Case 3: the sentence is too large on its own. Keep cutting pieces
        # off the front until what is left fits; every iteration shortens
        # $rest, so the loop terminates.
        debug_print(3, " ${bbla}Sentence exceeds limits, attempting to split.$rst");
        my $rest = $sentence;
        while ($rest ne ''
            && (word_count($rest) > $max_words || length($rest) > $max_chars)) {
            my ($first_part, $remaining) = split_at_punctuation($rest, $max_chars);
            if (defined $first_part) {
                debug_print(3, " ${gre}FINAL Split at punctuation: ${rst}$first_part");
                push @wrapped_lines, $first_part;
                $rest = $remaining // '';
                next;
            }

            # With a limit below one character nothing can be cut off;
            # give up and keep the remainder as it is.
            last if $max_chars < 1;

            # Directly split long sentences: break at the last space inside
            # the character limit, or mid-word when there is none.
            my $cut = substr($rest, 0, $max_chars);
            my $breakpoint = rindex($cut, ' ');
            my $head;
            if ($breakpoint > 0) {
                $head = substr($rest, 0, $breakpoint);
                $rest = substr($rest, $breakpoint + 1);
            } else {
                $head = $cut;
                $rest = substr($rest, length $cut);
            }
            $rest =~ s/^\s+//;
            debug_print(3, " ${gre}FINAL Forced split: ${rst}$head");
            push @wrapped_lines, $head;
        }
        $current_line = $rest;
        $current_word_count = word_count($current_line);
    }

    push @wrapped_lines, $current_line if $current_line ne '';
    debug_print(3, " ${gre}Final wrapped lines complete.$rst");
    for my $t (@wrapped_lines) {
        debug_print(3, " {{${bgre}$t$rst}}");
    }
    return @wrapped_lines;
}
# Main driver: read the input, rebuild paragraphs, re-wrap them and print the
# result, one wrapped line per output line.
#
# With --join-all ($opt_join) all paragraphs are first merged into a single
# one. No blank line is printed between paragraphs.
sub process_input {
    my @lines = read_input();

    # Strip leading and trailing whitespace (line endings included).
    for my $raw (@lines) {
        $raw =~ s/^\s+//;
        $raw =~ s/\s+$//;
    }

    my @paragraphs = join_appropriate_lines(@lines);
    @paragraphs = (join(' ', @paragraphs)) if $opt_join;

    PARAGRAPH:
    for my $paragraph (@paragraphs) {
        next PARAGRAPH if $paragraph =~ /^\s*$/;
        $paragraph =~ s/^\s+//s;
        $paragraph =~ s/\s+$//s;
        for my $line (wrap_text($paragraph, $max_word_count, $char_limit)) {
            print "$line\n";
        }
        # print "\n"; # Separate paragraphs
    }
}
# Script entry point: read the input, re-wrap it and print the result.
process_input();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment