jaggzh · April 16, 2024 01:25
diff --git a/rewrap-paragraphs b/rewrap-paragraphs
 #!/usr/bin/perl
 # gist-paste -u https://gist.github.com/jaggzh/68ade8171e8d7983f63d3017762699e1 rewrap-paragraphs
 use v5.36;
 use strict;
 use warnings;
 use Getopt::Long;
 use Lingua::EN::Sentence qw(get_sentences add_acronyms);
 use utf8;
 use bansi;

 # Decode input and encode all output (diagnostics included) as UTF-8.
 for my $stream (\*STDIN, \*STDOUT, \*STDERR) {
     binmode($stream, ":encoding(utf-8)");
 }

 # Defaults; the option parsing further down may override them.
 my ($max_word_count, $char_limit, $verbosity) = (256, 1000, 0);
 my ($opt_help, $opt_join);    # flags, undef until set on the command line
 my @files;                    # non-option arguments, used as input paths

 $| = 1;    # autoflush the currently selected output handle

 # Print the command-line help text to STDOUT.
 # The current values of $max_word_count and $char_limit are interpolated,
 # so the text shows the limits in effect when it is printed.
 sub usage {
     print <<~"EOT";
         Unwraps LFLF separated wrapped-paragraphs
         Options:
             -w # / --word-count #  Max word count ($max_word_count)
             -c # / --char-limit #  Max char count ($char_limit)
             -j / --join-all        Join all lines first (don't keep nl nl paragraph bounds)
             -v / --verbose         Increment verbosity
             -h / --help            Me!
         EOT
 }

 # Parse the command line. Anything that is not an option is collected
 # into @files as an input path.
 GetOptions(
     'w|word-count=i' => \$max_word_count,
     'c|char-limit=i' => \$char_limit,
     'v|verbose+'     => \$verbosity,    # Incremental verbosity
     'h|help'         => \$opt_help,
     'j|join-all'     => \$opt_join,
     '<>'             => sub { push @files, $_[0]; },
 ) or do {
     # GetOptions returns false after reporting a bad option on STDERR;
     # show the help and stop instead of running with half-parsed options.
     usage();
     exit 2;
 };

 if ($opt_help) {
     usage();
     exit;
 }

 # Collect every input line, line terminators included.
 # When paths were given on the command line (@files), each is read in
 # order as UTF-8; otherwise all of STDIN is read.
 # Returns the lines as one flat list. Dies if a file cannot be opened.
 sub read_input {
     my @collected;

     unless (@files) {
         @collected = <STDIN>;
         return @collected;
     }

     for my $file (@files) {
         open my $in, '<:encoding(utf-8)', $file or die "Can't open $file: $!";
         while (defined(my $row = <$in>)) {
             push @collected, $row;
         }
         close $in;
     }
     return @collected;
 }

 # Heuristic: does the text end a sentence?
 # Returns 1 when it ends in '?', '!' or ':', or in a '.' that is not the
 # tail of a dotted single-character abbreviation such as "e.g.".
 # Returns 0 otherwise.
 sub has_sent_end {
     my ($text) = @_;

     return 1 if $text =~ /[?!:]$/;
     return 0 if $text !~ /\.$/;    # no closing period at all

     # A period right after ".<word char>" looks like an abbreviation,
     # which is not really the end of a sentence.
     return $text =~ /\.\w\.$/ ? 0 : 1;
 }

 # Heuristic: true when the text begins with a lowercase ASCII letter,
 # optionally preceded by one hyphen, i.e. it does not look like the
 # start of a new sentence. Returns the result of the pattern match.
 sub seems_not_sent_start {
     my ($text) = @_;
     return $text =~ m{^-?[a-z]};
 }

 # Merge wrapped lines into paragraphs.
 #
 # Consecutive non-blank lines are joined with single spaces. A blank (or
 # whitespace-only) line ends the paragraph, unless the text gathered so
 # far does not look finished (has_sent_end) and the line right after the
 # blank looks like a continuation (seems_not_sent_start); in that case
 # the following line is pulled into the current paragraph.
 #
 # Takes the input lines as a list. The hyphen checks are anchored at the
 # line edges, so lines are expected to be trimmed already.
 # Returns the list of paragraphs with surrounding whitespace removed.
 sub join_appropriate_lines {
     my @ls = @_;
     my @newls;
     my $current_line = "";

     for (my $linei = 0; $linei <= $#ls; $linei++) {
         my $line = $ls[$linei];
         debug_print(5, "[$linei] Processing line: $line");

         if ($line !~ /^\s*$/) {
             $current_line .= ' ' . $line;
             next;
         }
         next if $current_line eq "";    # blank line with nothing accumulated

         if (!has_sent_end($current_line) && $linei < $#ls && seems_not_sent_start($ls[$linei+1])) {
             # The blank line interrupted a sentence: continue with the next line.
             $current_line =~ s/-$//;     # likely hyphenated
             $line = $ls[$linei+1];
             # Drop only the hyphen; the lookahead keeps the first letter of
             # the continued word in place.
             $line =~ s/^-(?=[a-z])//;
             $linei++;                    # the following line is consumed here
             $current_line .= ' ' . $line;
         } else {
             push(@newls, $current_line);
             debug_print(5, "New paragraph: $current_line");
             $current_line = "";          # Reset for the next set of lines
         }
     }

     push(@newls, $current_line) if $current_line ne "";    # Add the last accumulated line
     s/^\s+|\s+$//g for @newls;    # Trim leading and trailing whitespace
     return @newls;
 }

 # Print a diagnostic line to STDERR when $verbosity is high enough.
 #   $level - minimum $verbosity at which the message is shown
 #   @parts - message text; several pieces are concatenated, so callers
 #            that pass the message as a list still get the whole line
 sub debug_print {
     my ($level, @parts) = @_;
     return if $verbosity < $level;
     say STDERR join('', @parts);
     return;
 }

 # Count the whitespace-separated words in a string.
 # Uses split's special ' ' pattern, which skips leading whitespace, so
 # text that begins with a space is not credited with an extra empty word.
 # Returns 0 for an empty or all-whitespace string.
 sub word_count {
     my ($text) = @_;
     my @words = split ' ', $text;
     return scalar @words;
 }

 # Split a sentence in two just after a punctuation mark.
 #
 #   $sentence  - text to split
 #   $max_chars - the first part must be shorter than this many characters
 #
 # The marks are tried in priority order (';' first, '-' last) and, for
 # each, only its first occurrence is considered.
 # Returns (first part including the mark, remainder with leading
 # whitespace removed). The remainder is '' when the mark ends the
 # sentence. Returns an empty list when no mark qualifies, so a caller's
 # first variable ends up undef.
 sub split_at_punctuation {
     my ($sentence, $max_chars) = @_;
     my @punctuations = (';', ',', ')', '—', '–', '-'); # Include UTF-8 dashes
     foreach my $punc (@punctuations) {
         my $pos = index($sentence, $punc);
         next if $pos < 0;

         my $index = $pos + length($punc);    # offset just past the mark
         next unless $index < $max_chars;

         my $first_part = substr($sentence, 0, $index);
         # Keep every character after the mark; only separating whitespace
         # is dropped, so "well-known" splits into "well-" and "known".
         my $remaining = substr($sentence, $index);
         $remaining =~ s/^\s+//;
         return ($first_part, $remaining);
     }
     return;
 }

 # Wrap one paragraph into output lines made of whole sentences.
 #
 #   $text      - paragraph to wrap
 #   $max_words - maximum number of words per output line
 #   $max_chars - maximum number of characters per output line
 #
 # The paragraph is split into sentences with get_sentences(), and
 # sentences are packed onto the current line while both limits hold.
 # A sentence that exceeds a limit by itself is split once: at a
 # punctuation mark when split_at_punctuation() finds one, otherwise at
 # the last space inside the first $max_chars characters (or hard at
 # $max_chars when there is no usable space).
 # NOTE: the remainder of a split sentence becomes the new current line
 # without being re-checked, so it can still exceed the limits.
 #
 # Progress is reported through debug_print() at verbosity 2 and 3; the
 # $bgblu/$yel/$rst/... variables are presumably ANSI colour strings
 # exported by bansi (not visible here).
 # Returns the list of wrapped lines.
 sub wrap_text {
     my ($text, $max_words, $max_chars) = @_;
     my @wrapped_lines;
     my $sentences = get_sentences($text);
     my $current_line = '';
     my $current_word_count = 0;

     debug_print(2, "$bgblu$yel Beginning text wrapping. $rst");
     debug_print(3, "  {{$yel$text$rst}}");

     foreach my $sentence (@$sentences) {
         my $sentence_word_count = word_count($sentence);
         # Built as one string so the whole message reaches debug_print.
         debug_print(3, "    $whi$sentence_word_count <= $max_words && " . length($sentence) . " <= $max_chars$rst");
         if ($current_word_count + $sentence_word_count <= $max_words && length($current_line . ' ' . $sentence) <= $max_chars) {
             debug_print(3, "  ${bbla}Appending sentence within limits: ${rst}$sentence");
             # Compare against '' rather than testing truthiness: a line
             # consisting of "0" is real content.
             $current_line .= ($current_line ne '' ? ' ' : '') . $sentence;
             $current_word_count += $sentence_word_count;
         } else {
             if ($current_line ne '') {
                 debug_print(3, "  ${gre}FINAL Pushing current line: ${rst}$current_line");
                 push @wrapped_lines, $current_line;
             }
             if ($sentence_word_count <= $max_words && length($sentence) <= $max_chars) {
                 # The sentence fits on a line of its own.
                 $current_line = $sentence;
                 $current_word_count = $sentence_word_count;
             } else {
                 debug_print(3, "    ${bbla}Sentence exceeds limits, attempting to split.$rst");
                 my ($first_part, $remaining) = split_at_punctuation($sentence, $max_chars);
                 if (defined $first_part) {
                     debug_print(3, "    ${gre}FINAL Split at punctuation: ${rst}$first_part");
                     push @wrapped_lines, $first_part;
                     $current_line = $remaining // '';
                     $current_word_count = word_count($current_line);
                 } else {
                     # Directly split long sentences exceeding both limits
                     my $cut = substr($sentence, 0, $max_chars);
                     my $breakpoint = rindex($cut, ' ');
                     my ($head, $tail);
                     if ($breakpoint > 0) {
                         # Break at the last space that fits; the space is dropped.
                         $head = substr($sentence, 0, $breakpoint);
                         $tail = substr($sentence, $breakpoint + 1);
                     } else {
                         # No usable space (none, or only at index 0, which
                         # would give an empty line): cut hard at the limit.
                         $head = substr($sentence, 0, $max_chars);
                         $tail = substr($sentence, $max_chars);
                     }
                     debug_print(3, "      ${gre}FINAL Forced split: ${rst}$head");
                     push @wrapped_lines, $head;
                     $current_line = $tail // '';
                     $current_word_count = word_count($current_line);
                 }
             }
         }
     }

     push @wrapped_lines, $current_line if $current_line ne '';
     debug_print(3, "  ${gre}Final wrapped lines complete.$rst");
     for my $t (@wrapped_lines) {
         debug_print(3, "    {{${bgre}$t$rst}}");
     }

     return @wrapped_lines;
 }



 # Driver: read the input, rebuild paragraphs, wrap each one and print
 # the wrapped lines to STDOUT, one per line.
 # With $opt_join set, all paragraphs are first merged into a single one.
 sub process_input {
     # Strip leading and trailing whitespace (line terminators included).
     my @trimmed;
     for my $raw (read_input()) {
         (my $clean = $raw) =~ s/^\s+//;
         $clean =~ s/\s+$//;
         push @trimmed, $clean;
     }

     my @paragraphs = join_appropriate_lines(@trimmed);
     @paragraphs = (join(' ', @paragraphs)) if $opt_join;

     for my $paragraph (@paragraphs) {
         next if $paragraph =~ /^\s*$/;    # nothing to wrap
         (my $body = $paragraph) =~ s/^\s+//s;
         $body =~ s/\s+$//s;
         print "$_\n" for wrap_text($body, $max_word_count, $char_limit);
         # print "\n"; # Separate paragraphs
     }
 }

 process_input();
	#!/usr/bin/perl
	# gist-paste -u https://gist.github.com/jaggzh/68ade8171e8d7983f63d3017762699e1 rewrap-paragraphs
	use v5.36;
	use strict;
	use warnings;
	use Getopt::Long;
	use Lingua::EN::Sentence qw(get_sentences add_acronyms);
	use utf8;
	use bansi;

	# Decode input and encode all output (diagnostics included) as UTF-8.
	binmode(*STDIN, ":encoding(utf-8)");
	binmode(*STDOUT, ":encoding(utf-8)");
	binmode(*STDERR, ":encoding(utf-8)");

	# Defaults; the option parsing further down may override them.
	my $max_word_count = 256;
	my $char_limit = 1000;
	my $verbosity = 0;
	my $opt_help;
	my $opt_join;
	my @files;    # non-option arguments, used as input paths

	# Autoflush the currently selected output handle. The variable is $|;
	# a backslash in front of the pipe makes the statement invalid.
	$| = 1;

	# Print the command-line help text to STDOUT.
	# The current values of $max_word_count and $char_limit are interpolated,
	# so the text shows the limits in effect when it is printed.
	sub usage {
	    print <<~"EOT";
	        Unwraps LFLF separated wrapped-paragraphs
	        Options:
	        -w # / --word-count # Max word count ($max_word_count)
	        -c # / --char-limit # Max char count ($char_limit)
	        -j / --join-all Join all lines first (don't keep nl nl paragraph bounds)
	        -v / --verbose Increment verbosity
	        -h / --help Me!
	        EOT
	}

	# Parse the command line. Anything that is not an option is collected
	# into @files as an input path.
	# Option specs separate alternative names with a bare '|'; inside single
	# quotes a backslash before it would become part of the option name.
	GetOptions(
	    'w|word-count=i' => \$max_word_count,
	    'c|char-limit=i' => \$char_limit,
	    'v|verbose+'     => \$verbosity,    # Incremental verbosity
	    'h|help'         => \$opt_help,
	    'j|join-all'     => \$opt_join,
	    '<>'             => sub { push @files, $_[0]; },
	) or do {
	    # GetOptions returns false after reporting a bad option on STDERR;
	    # show the help and stop instead of running with half-parsed options.
	    usage();
	    exit 2;
	};

	if ($opt_help) {
	    usage();
	    exit;
	}

	# Collect every input line, line terminators included.
	# When paths were given on the command line (@files), each is read in
	# order as UTF-8; otherwise all of STDIN is read.
	# Returns the lines as one flat list. Dies if a file cannot be opened.
	sub read_input {
	    my @collected;

	    unless (@files) {
	        @collected = <STDIN>;
	        return @collected;
	    }

	    for my $file (@files) {
	        open my $in, '<:encoding(utf-8)', $file or die "Can't open $file: $!";
	        while (defined(my $row = <$in>)) {
	            push @collected, $row;
	        }
	        close $in;
	    }
	    return @collected;
	}

	# Heuristic: does the text end a sentence?
	# Returns 1 when it ends in '?', '!' or ':', or in a '.' that is not the
	# tail of a dotted single-character abbreviation such as "e.g.".
	# Returns 0 otherwise.
	sub has_sent_end {
	    my ($text) = @_;

	    return 1 if $text =~ /[?!:]$/;
	    return 0 if $text !~ /\.$/;    # no closing period at all

	    # A period right after ".<word char>" looks like an abbreviation,
	    # which is not really the end of a sentence.
	    return $text =~ /\.\w\.$/ ? 0 : 1;
	}

	# Heuristic: true when the text begins with a lowercase ASCII letter,
	# optionally preceded by one hyphen, i.e. it does not look like the
	# start of a new sentence. Returns the result of the pattern match.
	sub seems_not_sent_start {
	    my ($text) = @_;
	    return $text =~ m{^-?[a-z]};
	}

	# Merge wrapped lines into paragraphs.
	#
	# Consecutive non-blank lines are joined with single spaces. A blank (or
	# whitespace-only) line ends the paragraph, unless the text gathered so
	# far does not look finished (has_sent_end) and the line right after the
	# blank looks like a continuation (seems_not_sent_start); in that case
	# the following line is pulled into the current paragraph.
	#
	# Takes the input lines as a list. The hyphen checks are anchored at the
	# line edges, so lines are expected to be trimmed already.
	# Returns the list of paragraphs with surrounding whitespace removed.
	sub join_appropriate_lines {
	    my @ls = @_;
	    my @newls;
	    my $current_line = "";

	    for (my $linei = 0; $linei <= $#ls; $linei++) {
	        my $line = $ls[$linei];
	        debug_print(5, "[$linei] Processing line: $line");

	        if ($line !~ /^\s*$/) {
	            $current_line .= ' ' . $line;
	            next;
	        }
	        next if $current_line eq "";    # blank line with nothing accumulated

	        if (!has_sent_end($current_line) && $linei < $#ls && seems_not_sent_start($ls[$linei+1])) {
	            # The blank line interrupted a sentence: continue with the next line.
	            $current_line =~ s/-$//;     # likely hyphenated
	            $line = $ls[$linei+1];
	            # Drop only the hyphen; the lookahead keeps the first letter of
	            # the continued word in place.
	            $line =~ s/^-(?=[a-z])//;
	            $linei++;                    # the following line is consumed here
	            $current_line .= ' ' . $line;
	        } else {
	            push(@newls, $current_line);
	            debug_print(5, "New paragraph: $current_line");
	            $current_line = "";          # Reset for the next set of lines
	        }
	    }

	    push(@newls, $current_line) if $current_line ne "";    # Add the last accumulated line
	    # Trim leading and trailing whitespace. The alternation must be a bare
	    # '|'; an escaped one would only match a literal pipe character.
	    s/^\s+|\s+$//g for @newls;
	    return @newls;
	}

	# Print a diagnostic line to STDERR when $verbosity is high enough.
	#   $level - minimum $verbosity at which the message is shown
	#   @parts - message text; several pieces are concatenated, so callers
	#            that pass the message as a list still get the whole line
	sub debug_print {
	    my ($level, @parts) = @_;
	    return if $verbosity < $level;
	    say STDERR join('', @parts);
	    return;
	}

	# Count the whitespace-separated words in a string.
	# Uses split's special ' ' pattern, which skips leading whitespace, so
	# text that begins with a space is not credited with an extra empty word.
	# Returns 0 for an empty or all-whitespace string.
	sub word_count {
	    my ($text) = @_;
	    my @words = split ' ', $text;
	    return scalar @words;
	}

	# Split a sentence in two just after a punctuation mark.
	#
	#   $sentence  - text to split
	#   $max_chars - the first part must be shorter than this many characters
	#
	# The marks are tried in priority order (';' first, '-' last) and, for
	# each, only its first occurrence is considered.
	# Returns (first part including the mark, remainder with leading
	# whitespace removed). The remainder is '' when the mark ends the
	# sentence. Returns an empty list when no mark qualifies, so a caller's
	# first variable ends up undef.
	sub split_at_punctuation {
	    my ($sentence, $max_chars) = @_;
	    my @punctuations = (';', ',', ')', '—', '–', '-'); # Include UTF-8 dashes
	    foreach my $punc (@punctuations) {
	        my $pos = index($sentence, $punc);
	        next if $pos < 0;

	        my $index = $pos + length($punc);    # offset just past the mark
	        next unless $index < $max_chars;

	        my $first_part = substr($sentence, 0, $index);
	        # Keep every character after the mark; only separating whitespace
	        # is dropped, so "well-known" splits into "well-" and "known".
	        my $remaining = substr($sentence, $index);
	        $remaining =~ s/^\s+//;
	        return ($first_part, $remaining);
	    }
	    return;
	}

	# Wrap one paragraph into output lines made of whole sentences.
	#
	#   $text      - paragraph to wrap
	#   $max_words - maximum number of words per output line
	#   $max_chars - maximum number of characters per output line
	#
	# The paragraph is split into sentences with get_sentences(), and
	# sentences are packed onto the current line while both limits hold.
	# A sentence that exceeds a limit by itself is split once: at a
	# punctuation mark when split_at_punctuation() finds one, otherwise at
	# the last space inside the first $max_chars characters (or hard at
	# $max_chars when there is no usable space).
	# NOTE: the remainder of a split sentence becomes the new current line
	# without being re-checked, so it can still exceed the limits.
	#
	# Progress is reported through debug_print() at verbosity 2 and 3; the
	# $bgblu/$yel/$rst/... variables are presumably ANSI colour strings
	# exported by bansi (not visible here).
	# Returns the list of wrapped lines.
	sub wrap_text {
	    my ($text, $max_words, $max_chars) = @_;
	    my @wrapped_lines;
	    my $sentences = get_sentences($text);
	    my $current_line = '';
	    my $current_word_count = 0;

	    debug_print(2, "$bgblu$yel Beginning text wrapping. $rst");
	    debug_print(3, " {{$yel$text$rst}}");

	    foreach my $sentence (@$sentences) {
	        my $sentence_word_count = word_count($sentence);
	        # Built as one string so the whole message reaches debug_print.
	        debug_print(3, " $whi$sentence_word_count <= $max_words && " . length($sentence) . " <= $max_chars$rst");
	        if ($current_word_count + $sentence_word_count <= $max_words && length($current_line . ' ' . $sentence) <= $max_chars) {
	            debug_print(3, " ${bbla}Appending sentence within limits: ${rst}$sentence");
	            # Compare against '' rather than testing truthiness: a line
	            # consisting of "0" is real content.
	            $current_line .= ($current_line ne '' ? ' ' : '') . $sentence;
	            $current_word_count += $sentence_word_count;
	        } else {
	            if ($current_line ne '') {
	                debug_print(3, " ${gre}FINAL Pushing current line: ${rst}$current_line");
	                push @wrapped_lines, $current_line;
	            }
	            if ($sentence_word_count <= $max_words && length($sentence) <= $max_chars) {
	                # The sentence fits on a line of its own.
	                $current_line = $sentence;
	                $current_word_count = $sentence_word_count;
	            } else {
	                debug_print(3, " ${bbla}Sentence exceeds limits, attempting to split.$rst");
	                my ($first_part, $remaining) = split_at_punctuation($sentence, $max_chars);
	                if (defined $first_part) {
	                    debug_print(3, " ${gre}FINAL Split at punctuation: ${rst}$first_part");
	                    push @wrapped_lines, $first_part;
	                    $current_line = $remaining // '';
	                    $current_word_count = word_count($current_line);
	                } else {
	                    # Directly split long sentences exceeding both limits
	                    my $cut = substr($sentence, 0, $max_chars);
	                    my $breakpoint = rindex($cut, ' ');
	                    my ($head, $tail);
	                    if ($breakpoint > 0) {
	                        # Break at the last space that fits; the space is dropped.
	                        $head = substr($sentence, 0, $breakpoint);
	                        $tail = substr($sentence, $breakpoint + 1);
	                    } else {
	                        # No usable space (none, or only at index 0, which
	                        # would give an empty line): cut hard at the limit.
	                        $head = substr($sentence, 0, $max_chars);
	                        $tail = substr($sentence, $max_chars);
	                    }
	                    debug_print(3, " ${gre}FINAL Forced split: ${rst}$head");
	                    push @wrapped_lines, $head;
	                    $current_line = $tail // '';
	                    $current_word_count = word_count($current_line);
	                }
	            }
	        }
	    }

	    push @wrapped_lines, $current_line if $current_line ne '';
	    debug_print(3, " ${gre}Final wrapped lines complete.$rst");
	    for my $t (@wrapped_lines) {
	        debug_print(3, " {{${bgre}$t$rst}}");
	    }

	    return @wrapped_lines;
	}



	# Driver: read the input, rebuild paragraphs, wrap each one and print
	# the wrapped lines to STDOUT, one per line.
	# With $opt_join set, all paragraphs are first merged into a single one.
	sub process_input {
	    # Strip leading and trailing whitespace (line terminators included).
	    my @trimmed;
	    for my $raw (read_input()) {
	        (my $clean = $raw) =~ s/^\s+//;
	        $clean =~ s/\s+$//;
	        push @trimmed, $clean;
	    }

	    my @paragraphs = join_appropriate_lines(@trimmed);
	    @paragraphs = (join(' ', @paragraphs)) if $opt_join;

	    for my $paragraph (@paragraphs) {
	        next if $paragraph =~ /^\s*$/;    # nothing to wrap
	        (my $body = $paragraph) =~ s/^\s+//s;
	        $body =~ s/\s+$//s;
	        print "$_\n" for wrap_text($body, $max_word_count, $char_limit);
	        # print "\n"; # Separate paragraphs
	    }
	}

	process_input();