Skip to content

Instantly share code, notes, and snippets.

@ohader
Created September 10, 2018 11:50
Show Gist options
  • Save ohader/e5690799e31f846230fb773d9f04bae4 to your computer and use it in GitHub Desktop.
Save ohader/e5690799e31f846230fb773d9f04bae4 to your computer and use it in GitHub Desktop.
Duplicate file reducer. Searches for files having duplicate content and clones them to a dedicated directory.
<?php
declare(strict_types = 1);
namespace OliverHader\IndependentUtility;
/**
* Duplicate file reducer. Searches for files having duplicate content and
* clones them to a dedicated directory.
*
* @author Oliver Hader <[email protected]>
* @license MIT
*/
class DuplicateReducer
{
    /**
     * Regular expression fragments, applied wrapped in "#" delimiters.
     * A group of duplicates is dropped when its first file matches one of them.
     */
    private const IGNORE_PATTERNS = [
        '/cli-config\.php$',
    ];

    /**
     * Directory the reduced files are copied to, always with a trailing slash.
     *
     * @var string
     */
    private $targetDirectory;

    /**
     * Groups of files sharing the same content, keyed by their MD5 hash sum.
     *
     * @var array<string, string[]>
     */
    private $duplicates = [];

    /**
     * @param string $targetDirectory directory the duplicates are cloned to
     */
    public function __construct(string $targetDirectory)
    {
        $this->targetDirectory = rtrim($targetDirectory, '/') . '/';
    }

    /**
     * Collects the MD5 hash sums of all "*.php" files below the current
     * working directory (pruning "vendor" directories matched by the
     * "./*\/vendor" path pattern) and keeps those hash sums that are shared
     * by more than one file.
     *
     * Relies on the external "find" and "md5sum" commands.
     */
    public function retrieve()
    {
        echo '+ Calculating hash sums...' . PHP_EOL;
        $output = [];
        $exitCode = 0;
        // "-exec ... +" hands many files to a single md5sum process; the
        // output still is one "<hash>  <file>" line per file.
        exec(
            'find . -type d -a -path \'./*/vendor\' -prune -o -type f -a -name \'*.php\' -a -exec md5sum {} +',
            $output,
            $exitCode
        );
        if ($exitCode !== 0) {
            // best effort: whatever has been printed is still evaluated
            echo '! Hash sum calculation exited with status ' . $exitCode
                . ', results may be incomplete' . PHP_EOL;
        }

        $duplicates = [];
        foreach ($output as $line) {
            if (preg_match('#^(?P<hash>[a-z0-9]+)\s+(?P<file>.+)#i', $line, $matches)) {
                $duplicates[$matches['hash']][] = $matches['file'];
            }
        }

        echo '+ Filtering duplicates...' . PHP_EOL;
        $this->duplicates = array_filter(
            $duplicates,
            function (array $files) {
                // only the first file of a group is tested against the ignore patterns
                return count($files) > 1
                    && !$this->someMatches(self::IGNORE_PATTERNS, $files[0]);
            }
        );
    }

    /**
     * Copies the first file of each group of duplicates to the target
     * directory, using the path segments shared by all files of the group
     * as relative file name. Groups without a shared name are skipped.
     *
     * NOTE(review): groups having different content but the same common
     * name end up at the same target, the one processed last wins.
     */
    public function persist()
    {
        echo '+ Persisting in file system...' . PHP_EOL;
        foreach ($this->duplicates as $files) {
            $commonName = $this->getCommonName(...$files);
            if ($commonName === null) {
                echo '! No common name for duplicates of ' . $files[0] . ', skipping' . PHP_EOL;
                continue;
            }
            $target = $this->targetDirectory . $commonName;
            $targetDirectory = dirname($target);
            // the trailing is_dir() covers a directory that appeared in the meantime
            if (!is_dir($targetDirectory)
                && !mkdir($targetDirectory, 0755, true)
                && !is_dir($targetDirectory)
            ) {
                echo '! Unable to create directory ' . $targetDirectory . PHP_EOL;
                continue;
            }
            if (!copy($files[0], $target)) {
                echo '! Unable to copy ' . $files[0] . ' to ' . $target . PHP_EOL;
            }
        }
    }

    /**
     * Determines the longest run of trailing path segments shared by all
     * given files, e.g. "vendor/foo/Bar.php" for "./a/vendor/foo/Bar.php"
     * and "./b/vendor/foo/Bar.php". Only complete segments are taken into
     * account, so partially matching directory or file names never leak
     * into the result.
     *
     * @param string ...$files
     * @return null|string null if fewer than two files are given or if the
     *                     files do not share a single trailing path segment
     */
    private function getCommonName(string ...$files): ?string
    {
        if (count($files) < 2) {
            return null;
        }
        // segments in reverse order: index 0 is the file name
        $segmentLists = array_map(
            static function (string $file): array {
                return array_reverse(explode('/', $file));
            },
            $files
        );
        $first = array_shift($segmentLists);
        $common = [];
        foreach ($first as $index => $segment) {
            // "." and empty segments anchor a path and are not part of a name
            if ($segment === '' || $segment === '.') {
                break;
            }
            foreach ($segmentLists as $segments) {
                if (!isset($segments[$index]) || $segments[$index] !== $segment) {
                    break 2;
                }
            }
            $common[] = $segment;
        }
        if ($common === []) {
            return null;
        }
        return implode('/', array_reverse($common));
    }

    /**
     * Tells whether at least one of the patterns matches the subject.
     *
     * @param array $patterns regular expression fragments without delimiters
     * @param string $subject
     * @return bool
     */
    private function someMatches(array $patterns, string $subject): bool
    {
        foreach ($patterns as $pattern) {
            if (preg_match('#' . $pattern . '#', $subject)) {
                return true;
            }
        }
        return false;
    }
}
// Entry point: reduce the duplicates found below the current working
// directory into "reduced/". Wrapped in an immediately invoked closure
// to keep the global scope free of variables.
(static function () {
    $duplicateReducer = new DuplicateReducer('reduced/');
    $duplicateReducer->retrieve();
    $duplicateReducer->persist();
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment