Created
May 1, 2025 21:13
-
-
Save MasterDuke17/d8403e036ab778bcaa47310dcdd89b56 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use v6; | |
use nqp; | |
unit module Text::Diff::Sift4; | |
sub sift4(Str:D() $s1, Str:D() $s2, Int $maxOffset = 100, Int $maxDistance = 100 --> Int) is export { | |
my int $l1 = nqp::chars($s1); | |
my int $l2 = nqp::chars($s2); | |
return $l2 unless $l1; | |
return $l1 unless $l2; | |
my int $c1; | |
my int $c2; | |
my int $lcss; | |
my int $local_cs; | |
my int $trans; | |
my int $max_offset = $maxOffset; | |
my int $max_distance = $maxDistance; | |
my $offset_arr := nqp::list_i; | |
my int $isTrans; | |
my int $i; | |
while nqp::islt_i($c1, $l1) && nqp::islt_i($c2, $l2) { | |
if nqp::iseq_i(nqp::ordat($s1, $c1), nqp::ordat($s2, $c2)) { | |
nqp::stmts( | |
($isTrans = 0), | |
($i = 0), | |
($local_cs = nqp::add_i($local_cs, 1)), | |
nqp::while(nqp::islt_i($i, nqp::elems($offset_arr)), | |
nqp::stmts( | |
(my int $_trans = nqp::atpos_i($offset_arr, $i)), | |
(my int $_c1 = nqp::atpos_i($offset_arr, nqp::add_i($i, 1))), | |
(my int $_c2 = nqp::atpos_i($offset_arr, nqp::add_i($i, 2))), | |
nqp::if(nqp::isle_i($c1, $_c1) || nqp::isle_i($c2, $_c2), | |
nqp::stmts( | |
($isTrans = nqp::isge_i(nqp::abs_i(nqp::sub_i($c2, $c1)), nqp::abs_i(nqp::sub_i($_c2, $_c1)))), | |
nqp::if($isTrans, | |
($trans = nqp::add_i($trans, 1)), | |
nqp::unless($_trans, | |
nqp::stmts(nqp::bindpos_i($offset_arr, $i, 1), | |
($trans = nqp::add_i($trans, 1))))), | |
($i = 2147483647)), | |
nqp::if((nqp::isgt_i($c1, $_c2) && nqp::isgt_i($c2, $_c1)), | |
nqp::splice($offset_arr, nqp::list, $i, 3), | |
($i = nqp::add_i($i, 3)))))), | |
nqp::push_i($offset_arr, $isTrans), | |
nqp::push_i($offset_arr, $c1), | |
nqp::push_i($offset_arr, $c2)) | |
} else { | |
nqp::stmts( | |
($lcss = nqp::add_i($lcss, $local_cs)), | |
($local_cs = 0), | |
nqp::if(nqp::isne_i($c1, $c2), ($c1 = $c2 = nqp::isle_i($c1, $c2) ?? $c1 !! $c2)), | |
($i = 0), | |
nqp::while( | |
nqp::islt_i($i, $max_offset) && (nqp::islt_i(nqp::add_i($c1, $i), $l1) || | |
nqp::islt_i(nqp::add_i($c2, $i), $l2)), | |
nqp::stmts( | |
nqp::if(nqp::islt_i(nqp::add_i($c1, $i), $l1) && | |
nqp::iseq_i(nqp::ordat($s1, nqp::add_i($c1, $i)), | |
nqp::ordat($s2, $c2)), | |
nqp::stmts( | |
($c1 = nqp::sub_i(nqp::add_i($c1, $i), 1)), | |
($c2 = nqp::sub_i($c2, 1)), | |
($i = 2147483647))), | |
nqp::if(nqp::islt_i(nqp::add_i($c2, $i), $l2) && | |
nqp::iseq_i(nqp::ordat($s1, $c1), nqp::ordat($s2, nqp::add_i($c2, $i))), | |
nqp::stmts( | |
($c2 = nqp::sub_i(nqp::add_i($c2, $i), 1)), | |
($c1 = nqp::sub_i($c1, 1)), | |
($i = 2147483647))), | |
($i = nqp::add_i($i, 1))))) | |
} | |
$c1 = nqp::add_i($c1, 1); | |
$c2 = nqp::add_i($c2, 1); | |
nqp::if(nqp::isge_i( nqp::add_i(nqp::sub_i((nqp::isge_i($c1, $c2) ?? $c1 !! $c2), $lcss), $trans), $max_distance), | |
nqp::if($max_distance, return nqp::add_i(nqp::sub_i((nqp::isge_i($c1, $c2) ?? $c1 !! $c2), $lcss), $trans))); | |
nqp::if(nqp::isge_i($c1, $l1) || nqp::isge_i($c2, $l2), | |
nqp::stmts( | |
($lcss = nqp::add_i($lcss, $local_cs)), | |
($local_cs = 0), | |
($c1 = $c2 = (nqp::isle_i($c1, $c2) ?? $c1 !! $c2))) | |
); | |
} | |
$lcss = nqp::add_i($lcss, $local_cs); | |
nqp::add_i(nqp::sub_i((nqp::isge_i($l1, $l2) ?? $l1 !! $l2), $lcss), $trans) | |
} | |
# vim: ft=perl6 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use v6; | |
use nqp; | |
unit module Text::Diff::Sift4; | |
class Offset { | |
has int $.c1 is built(:bind); | |
has int $.c2 is built(:bind); | |
has Bool $.trans is rw is built(:bind); | |
} | |
sub sift4(str $s1, str $s2, int $maxOffset = 100, int $maxDistance = 100 --> int) is export { | |
my int $l1 = chars($s1); | |
my int $l2 = chars($s2); | |
return $l2 unless $l1; | |
return $l1 unless $l2; | |
my int $c1; | |
my int $c2; | |
my int $lcss; | |
my int $local_cs; | |
my int $trans; | |
my @offset_arr; | |
my int $i; | |
my int $tempDistance; | |
my int $ofs-c1; | |
my int $ofs-c2; | |
my Offset $ofs; | |
while $c1 < $l1 and $c2 < $l2 { | |
if nqp::ordat($s1, $c1) == nqp::ordat($s2, $c2) { | |
++$local_cs; | |
my Bool $isTrans = False; | |
$i = 0; | |
while $i < @offset_arr.elems { | |
$ofs := @offset_arr[$i]; | |
$ofs-c1 = $ofs.c1; | |
$ofs-c2 = $ofs.c2; | |
if $c1 <= $ofs-c1 or $c2 <= $ofs-c2 { | |
$isTrans = abs($c2 - $c1) >= abs($ofs-c2 - $ofs-c1); | |
if $isTrans { | |
++$trans; | |
} else { | |
if !$ofs.trans { | |
$ofs.trans = True; | |
++$trans; | |
} | |
} | |
last; | |
} else { | |
if $c1 > $ofs-c2 and $c2 > $ofs-c1 { | |
@offset_arr.splice($i, 1); | |
} else { | |
++$i; | |
} | |
} | |
} | |
@offset_arr.push(Offset.new(:$c1, :$c2, trans => $isTrans)); | |
} else { | |
$lcss += $local_cs; | |
$local_cs = 0; | |
if $c1 != $c2 { | |
$c1 = $c2 = ($c1 min $c2); | |
} | |
loop ($i = 0; $i < $maxOffset and ($c1 + $i < $l1 or $c2 + $i < $l2); ++$i) { | |
if ($c1 + $i < $l1) and nqp::ordat($s1, $c1 + $i) == nqp::ordat($s2, $c2) { | |
$c1 += $i - 1; | |
--$c2; | |
last; | |
} | |
if ($c2 + $i < $l2) and nqp::ordat($s1, $c1) == nqp::ordat($s2, $c2 + $i) { | |
$c2 += $i - 1; | |
--$c1; | |
last; | |
} | |
} | |
} | |
++$c1; | |
++$c2; | |
if $maxDistance { | |
$tempDistance = ($c1 max $c2) - $lcss + $trans; | |
return $tempDistance if $tempDistance >= $maxDistance; | |
} | |
if $c1 >= $l1 or $c2 >= $l2 { | |
$lcss += $local_cs; | |
$local_cs = 0; | |
$c1 = $c2 = ($c1 min $c2); | |
} | |
} | |
$lcss += $local_cs; | |
return ($l1 max $l2) - $lcss + $trans; | |
} | |
# vim: ft=perl6 |
Are the
return
s in line 16/17 necessary?Lose the final
return
Algorithmically speaking: would it make sense to split the cases of first string larger / smaller than second string into separate parts, so you'd only need to check on boundary?
Why is $IsTrans a Bool and not an int?
- not sure yet, i'll check
- right
- it's been about 9 years since i actually looked at the algorithm, but i'll think about it (or pass the question on to the original author)
- because that's how it was in the original JavaScript i ported from, but yeah, changing it (and the
$.trans
) is slightly faster
3. Algorithmically speaking: would it make sense to split the cases of first string larger / smaller than second string into separate parts, so you'd only need to check on boundary?
Hm. I haven't gotten into the details of the algorithm yet, so I'm not actually sure what I'm about to say makes sense, but if you split them up maybe it wouldn't correctly track a character moving across that boundary?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Why is
$IsTrans
aBool
and not anint
?