Created
December 21, 2019 21:56
-
-
Save bpj/2bd0b044cf7e31b33b26ea9c806c1607 to your computer and use it in GitHub Desktop.
Emulate variable length lookbehind in Perl regexes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
#=============================================================================== | |
# Perl used to not support variable-length lookbehind (VLLB) in | |
# regexes. Version 5.30 supports *limited length* | |
# VLLB, but this trick emulates VLLB in older versions too, | |
# and apparently without any arbitrary | |
# length limit! | |
# | |
# See <http://www.drregex.com/2019/02/variable-length-lookbehinds-actually.html?m=1> | |
# | |
# My problem is that the Greek lower case letter "σ" (sigma) | |
# has a special form "ς" at the end of words, while it has a | |
# single form "Σ" in uppercase, so when you change case or | |
# transliterate from Latin script you need to substitute the | |
# final form. | |
# | |
# Unfortunately `s/σ\b/ς/g` doesn't do the trick: | |
# | |
# * When "σ" is followed by an apostrophe it should remain "σ". | |
# Fortunately no one in their right mind uses English style | |
# single quotes in Greek text (you use "«...»" as outer quotes | |
# and "‹...›" or “...” as inner quotes) so all sigmas before | |
# an apostrophe should remain non-final. | |
# | |
# * I want to replace non-final sigma by final sigma only when | |
# it is directly preceded by another Greek letter. Isolated | |
# letters and letters in formulae and the like should be left | |
# as is! And here is the rub: in decomposed text — with | |
# combining diacritical marks — any Greek vowel may be | |
# followed by zero or more diacritical marks, which used to | |
# ruin the party because you need to match a sigma preceded by | |
# any Greek letter *and optional diacritical marks*! | |
# Sure you can go `s{ ( (?= \pL ) \p{Greek} \p{Mn}* ) σ ... }{$1ς}gx` but that isn't very elegant, now is it? It's probably | |
# a lot faster though! :-) | |
# | |
#=============================================================================== | |
use utf8; | |
# use utf8::all; | |
use autodie 2.26; | |
use 5.014; | |
use strict; | |
use warnings; | |
use warnings qw(FATAL utf8); | |
use open qw(:std :utf8); | |
use Scalar::Util qw[reftype]; | |
use Carp; | |
# Subroutine to wrap a regex object in the (really hairy!) VLLB idiom | |
sub vllb { | |
my($X) = @_; | |
# Check that we got a proper regex | |
'REGEXP' eq( reftype($X) // "" ) # / | |
or croak "Not a Regexp object"; | |
return qr{ | |
(?=(?<a>[\s\S]*)) # Capture the rest of the string in "a" | |
(?<b> | |
$X | |
(?=\k<a>\z) # Match X followed by the contents of "a" to ensure | |
# the emulated lookbehind stops at the correct point. | |
| # OR | |
(?<= # Look behind (one character) match either: | |
(?= | |
x^ # A contradiction; non-empty to appease the nanny | |
| # OR | |
(?&b) # Recurse (match X OR look behind (one character)) etc.. | |
) | |
[\s\S] # How far we go back each step: one single character | |
) | |
) | |
}x; | |
} | |
# Match a Greek letter followed by zero or more combining marks | |
my $greek_letter = vllb qr{ (?=\pL) \p{Greek} \p{Mn}* }x; | |
# Match a non-final sigma preceded by a Greek letter | |
# and not followed by an apostrophe | |
my $final_sigma = qr{ $greek_letter σ \b (?! [\'’᾽] ) }x; | |
# The first few lines of the Odyssey, | |
# decomposed and without any final sigmas | |
# plus a few examples where conversion shouldn't happen | |
my $text = <<'EOT'; | |
ἄνδρα μοι ἔννεπε, μοῦσα, πολύτροπον, ὃσ μάλα πολλὰ | |
πλάγχθη, ἐπεὶ Τροίησ ἱερὸν πτολίεθρον ἔπερσεν: | |
πολλῶν δ᾽ ἀνθρώπων ἴδεν ἄστεα καὶ νόον ἔγνω, | |
πολλὰ δ᾽ ὅ γ᾽ ἐν πόντῳ πάθεν ἄλγεα ὃν κατὰ θυμόν, | |
ἀρνύμενοσ ἥν τε ψυχὴν καὶ νόστον ἑταίρων. | |
ἀλλ᾽ οὐδ᾽ ὣσ ἑτάρουσ ἐρρύσατο, ἱέμενόσ περ: | |
αὐτῶν γὰρ σφετέρῃσιν ἀτασθαλίῃσιν ὄλοντο, | |
νήπιοι, οἳ κατὰ βοῦσ Ὑπερίονοσ Ἠελίοιο | |
ἤσθιον: αὐτὰρ ὁ τοῖσιν ἀφείλετο νόστιμον ἦμαρ. | |
τῶν ἁμόθεν γε, θεά, θύγατερ Διόσ, εἰπὲ καὶ ἡμῖν. | |
σ | |
ασ’ασ | |
aσ | |
EOT | |
# The easy way | |
say $text =~ s/$final_sigma/ς/gr; | |
# And here is just to show that the trick works with the `\G` anchor! | |
my $len = length $text; | |
pos($text) = 0; | |
while ( pos($text) < $len ) { | |
# Either any number of non-sigmas | |
if ( $text =~ m/\G([^σ]+)/gcs ) { | |
print $1 | |
} | |
# or a final sigma | |
elsif ( $text =~ m/\G$final_sigma/gc ) { | |
print "ς"; | |
} | |
# or a non-final sigma | |
elsif ( $text =~ m/\Gσ/gcs ) { | |
print "σ"; | |
} | |
# Break out of the loop if we didn't match anything! | |
else { | |
last; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment