Skip to content

Instantly share code, notes, and snippets.

@polonskiy
Created January 27, 2014 16:00
Show Gist options
  • Save polonskiy/8651225 to your computer and use it in GitHub Desktop.
Save polonskiy/8651225 to your computer and use it in GitHub Desktop.
Near-duplicate finder. Shingling
<?php
/**
 * Estimate how similar several texts are by comparing their word shingles.
 *
 * Each text is lower-cased and split into "words" (runs of four or more word
 * characters; shorter tokens are ignored). Every run of $shingle_len
 * consecutive words forms one shingle, which is hashed with $algo. The
 * shingles of each text are treated as a set, and the score is
 *
 *     100 * N * |S1 n S2 n ... n SN| / (|S1| + |S2| + ... + |SN|)
 *
 * where N is the number of texts. For two texts this is the Sorensen-Dice
 * coefficient expressed as a percentage.
 *
 * @param array  $texts       List of UTF-8 strings to compare (any array keys).
 * @param int    $shingle_len Number of consecutive words per shingle (>= 1).
 * @param string $algo        Algorithm name passed to hash().
 *
 * @return float Similarity in the range 0..100. Returns 0.0 when no text is
 *               long enough to produce a single shingle.
 *
 * @throws InvalidArgumentException If $shingle_len is smaller than 1.
 */
function similar_text_shingles($texts, $shingle_len = 10, $algo = 'md5') {
    $shingle_len = (int) $shingle_len;
    if ($shingle_len < 1) {
        throw new InvalidArgumentException('$shingle_len must be a positive integer');
    }

    $sets = array();   // one "hash => true" map per text, used as a set
    $total = 0;        // sum of the set sizes
    $text_count = 0;

    foreach ($texts as $text) {
        $text_count++;

        preg_match_all('#\w{4,}#u', mb_strtolower($text, 'utf-8'), $matches);
        // Guard against a missing match list so a failed match degrades to "no words".
        $words = isset($matches[0]) ? $matches[0] : array();

        // A text of W words has W - len + 1 shingles, so the last valid start
        // offset is W - len (inclusive). Negative means the text is too short.
        $last_start = count($words) - $shingle_len;

        $set = array();
        for ($j = 0; $j <= $last_start; $j++) {
            $shingle = implode(' ', array_slice($words, $j, $shingle_len));
            // Keying by hash de-duplicates repeated shingles within one text.
            $set[hash($algo, $shingle)] = true;
        }

        $sets[] = $set;
        $total += count($set);
    }

    // Nothing to compare (no texts, or every text shorter than one shingle).
    if ($total === 0) {
        return 0.0;
    }

    // Shingles present in every text.
    $common = array_shift($sets);
    foreach ($sets as $set) {
        if (!$common) {
            break;
        }
        $common = array_intersect_key($common, $set);
    }

    return 100.0 * $text_count * count($common) / $total;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment