Created
May 1, 2025 21:13
-
-
Save MasterDuke17/d8403e036ab778bcaa47310dcdd89b56 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use v6; | |
use nqp; | |
unit module Text::Diff::Sift4; | |
sub sift4(Str:D() $s1, Str:D() $s2, Int $maxOffset = 100, Int $maxDistance = 100 --> Int) is export { | |
my int $l1 = nqp::chars($s1); | |
my int $l2 = nqp::chars($s2); | |
return $l2 unless $l1; | |
return $l1 unless $l2; | |
my int $c1; | |
my int $c2; | |
my int $lcss; | |
my int $local_cs; | |
my int $trans; | |
my int $max_offset = $maxOffset; | |
my int $max_distance = $maxDistance; | |
my $offset_arr := nqp::list_i; | |
my int $isTrans; | |
my int $i; | |
while nqp::islt_i($c1, $l1) && nqp::islt_i($c2, $l2) { | |
if nqp::iseq_i(nqp::ordat($s1, $c1), nqp::ordat($s2, $c2)) { | |
nqp::stmts( | |
($isTrans = 0), | |
($i = 0), | |
($local_cs = nqp::add_i($local_cs, 1)), | |
nqp::while(nqp::islt_i($i, nqp::elems($offset_arr)), | |
nqp::stmts( | |
(my int $_trans = nqp::atpos_i($offset_arr, $i)), | |
(my int $_c1 = nqp::atpos_i($offset_arr, nqp::add_i($i, 1))), | |
(my int $_c2 = nqp::atpos_i($offset_arr, nqp::add_i($i, 2))), | |
nqp::if(nqp::isle_i($c1, $_c1) || nqp::isle_i($c2, $_c2), | |
nqp::stmts( | |
($isTrans = nqp::isge_i(nqp::abs_i(nqp::sub_i($c2, $c1)), nqp::abs_i(nqp::sub_i($_c2, $_c1)))), | |
nqp::if($isTrans, | |
($trans = nqp::add_i($trans, 1)), | |
nqp::unless($_trans, | |
nqp::stmts(nqp::bindpos_i($offset_arr, $i, 1), | |
($trans = nqp::add_i($trans, 1))))), | |
($i = 2147483647)), | |
nqp::if((nqp::isgt_i($c1, $_c2) && nqp::isgt_i($c2, $_c1)), | |
nqp::splice($offset_arr, nqp::list, $i, 3), | |
($i = nqp::add_i($i, 3)))))), | |
nqp::push_i($offset_arr, $isTrans), | |
nqp::push_i($offset_arr, $c1), | |
nqp::push_i($offset_arr, $c2)) | |
} else { | |
nqp::stmts( | |
($lcss = nqp::add_i($lcss, $local_cs)), | |
($local_cs = 0), | |
nqp::if(nqp::isne_i($c1, $c2), ($c1 = $c2 = nqp::isle_i($c1, $c2) ?? $c1 !! $c2)), | |
($i = 0), | |
nqp::while( | |
nqp::islt_i($i, $max_offset) && (nqp::islt_i(nqp::add_i($c1, $i), $l1) || | |
nqp::islt_i(nqp::add_i($c2, $i), $l2)), | |
nqp::stmts( | |
nqp::if(nqp::islt_i(nqp::add_i($c1, $i), $l1) && | |
nqp::iseq_i(nqp::ordat($s1, nqp::add_i($c1, $i)), | |
nqp::ordat($s2, $c2)), | |
nqp::stmts( | |
($c1 = nqp::sub_i(nqp::add_i($c1, $i), 1)), | |
($c2 = nqp::sub_i($c2, 1)), | |
($i = 2147483647))), | |
nqp::if(nqp::islt_i(nqp::add_i($c2, $i), $l2) && | |
nqp::iseq_i(nqp::ordat($s1, $c1), nqp::ordat($s2, nqp::add_i($c2, $i))), | |
nqp::stmts( | |
($c2 = nqp::sub_i(nqp::add_i($c2, $i), 1)), | |
($c1 = nqp::sub_i($c1, 1)), | |
($i = 2147483647))), | |
($i = nqp::add_i($i, 1))))) | |
} | |
$c1 = nqp::add_i($c1, 1); | |
$c2 = nqp::add_i($c2, 1); | |
nqp::if(nqp::isge_i( nqp::add_i(nqp::sub_i((nqp::isge_i($c1, $c2) ?? $c1 !! $c2), $lcss), $trans), $max_distance), | |
nqp::if($max_distance, return nqp::add_i(nqp::sub_i((nqp::isge_i($c1, $c2) ?? $c1 !! $c2), $lcss), $trans))); | |
nqp::if(nqp::isge_i($c1, $l1) || nqp::isge_i($c2, $l2), | |
nqp::stmts( | |
($lcss = nqp::add_i($lcss, $local_cs)), | |
($local_cs = 0), | |
($c1 = $c2 = (nqp::isle_i($c1, $c2) ?? $c1 !! $c2))) | |
); | |
} | |
$lcss = nqp::add_i($lcss, $local_cs); | |
nqp::add_i(nqp::sub_i((nqp::isge_i($l1, $l2) ?? $l1 !! $l2), $lcss), $trans) | |
} | |
# vim: ft=perl6 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
use v6; | |
use nqp; | |
unit module Text::Diff::Sift4; | |
class Offset { | |
has int $.c1 is built(:bind); | |
has int $.c2 is built(:bind); | |
has Bool $.trans is rw is built(:bind); | |
} | |
sub sift4(str $s1, str $s2, int $maxOffset = 100, int $maxDistance = 100 --> int) is export { | |
my int $l1 = chars($s1); | |
my int $l2 = chars($s2); | |
return $l2 unless $l1; | |
return $l1 unless $l2; | |
my int $c1; | |
my int $c2; | |
my int $lcss; | |
my int $local_cs; | |
my int $trans; | |
my @offset_arr; | |
my int $i; | |
my int $tempDistance; | |
my int $ofs-c1; | |
my int $ofs-c2; | |
my Offset $ofs; | |
while $c1 < $l1 and $c2 < $l2 { | |
if nqp::ordat($s1, $c1) == nqp::ordat($s2, $c2) { | |
++$local_cs; | |
my Bool $isTrans = False; | |
$i = 0; | |
while $i < @offset_arr.elems { | |
$ofs := @offset_arr[$i]; | |
$ofs-c1 = $ofs.c1; | |
$ofs-c2 = $ofs.c2; | |
if $c1 <= $ofs-c1 or $c2 <= $ofs-c2 { | |
$isTrans = abs($c2 - $c1) >= abs($ofs-c2 - $ofs-c1); | |
if $isTrans { | |
++$trans; | |
} else { | |
if !$ofs.trans { | |
$ofs.trans = True; | |
++$trans; | |
} | |
} | |
last; | |
} else { | |
if $c1 > $ofs-c2 and $c2 > $ofs-c1 { | |
@offset_arr.splice($i, 1); | |
} else { | |
++$i; | |
} | |
} | |
} | |
@offset_arr.push(Offset.new(:$c1, :$c2, trans => $isTrans)); | |
} else { | |
$lcss += $local_cs; | |
$local_cs = 0; | |
if $c1 != $c2 { | |
$c1 = $c2 = ($c1 min $c2); | |
} | |
loop ($i = 0; $i < $maxOffset and ($c1 + $i < $l1 or $c2 + $i < $l2); ++$i) { | |
if ($c1 + $i < $l1) and nqp::ordat($s1, $c1 + $i) == nqp::ordat($s2, $c2) { | |
$c1 += $i - 1; | |
--$c2; | |
last; | |
} | |
if ($c2 + $i < $l2) and nqp::ordat($s1, $c1) == nqp::ordat($s2, $c2 + $i) { | |
$c2 += $i - 1; | |
--$c1; | |
last; | |
} | |
} | |
} | |
++$c1; | |
++$c2; | |
if $maxDistance { | |
$tempDistance = ($c1 max $c2) - $lcss + $trans; | |
return $tempDistance if $tempDistance >= $maxDistance; | |
} | |
if $c1 >= $l1 or $c2 >= $l2 { | |
$lcss += $local_cs; | |
$local_cs = 0; | |
$c1 = $c2 = ($c1 min $c2); | |
} | |
} | |
$lcss += $local_cs; | |
return ($l1 max $l2) - $lcss + $trans; | |
} | |
# vim: ft=perl6 |
3. Algorithmically speaking: would it make sense to split the cases of first string larger / smaller than second string into separate parts, so you'd only need to check on boundary?
Hm. I haven't gotten into the details of the algorithm yet, so I'm not actually sure what I'm about to say makes sense, but if you split them up maybe it wouldn't correctly track a character moving across that boundary?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
$.trans
) is slightly faster