Created
September 10, 2018 11:50
-
-
Save ohader/e5690799e31f846230fb773d9f04bae4 to your computer and use it in GitHub Desktop.
Duplicate file reducer. Searches for files having duplicate content and clones them to a dedicated directory.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
declare(strict_types = 1); | |
namespace OliverHader\IndependentUtility; | |
/**
 * Duplicate file reducer. Searches for files having duplicate content and
 * clones them to a dedicated directory.
 *
 * @author Oliver Hader <[email protected]>
 * @license MIT
 */
class DuplicateReducer
{
    /**
     * Regular expression fragments describing paths that must not be
     * reported as duplicates. Each fragment is wrapped in `#` delimiters
     * by someMatches() before it is evaluated.
     */
    private const IGNORE_PATTERNS = [
        // The dot is escaped so that only a literal "cli-config.php" matches.
        '/cli-config\.php$',
    ];

    /**
     * Directory the duplicates are copied to, always ending with a slash.
     *
     * @var string
     */
    private $targetDirectory;

    /**
     * Duplicate groups found by retrieve(): hash sum => list of file paths.
     *
     * @var array
     */
    private $duplicates = [];

    /**
     * @param string $targetDirectory Directory receiving the copies; trailing
     *                                slashes are normalized to exactly one.
     */
    public function __construct(string $targetDirectory)
    {
        $this->targetDirectory = rtrim($targetDirectory, '/') . '/';
    }

    /**
     * Collects MD5 sums of all *.php files below the current working
     * directory (pruning `./<dir>/vendor` directories) via `find` and
     * `md5sum`, groups the files by hash sum and keeps those groups that
     * contain more than one file and whose first file is not ignored.
     */
    public function retrieve()
    {
        echo '+ Calculating hash sums...' . PHP_EOL;
        // exec() appends to an existing array, so start from a known state.
        $output = [];
        exec(
            'find . -type d -a -path \'./*/vendor\' -prune -o -type f -a -name \'*.php\' -a -print -exec md5sum {} \;',
            $output,
            $exitCode
        );
        if ($exitCode !== 0) {
            // Keep going with whatever was collected, but do not hide the problem.
            echo '! find exited with status ' . $exitCode . ', results may be incomplete' . PHP_EOL;
        }

        $duplicates = [];
        foreach ($output as $line) {
            // Only "<hash> <file>" lines are of interest; the bare path lines
            // emitted by -print start with "./" and thus do not match.
            if (preg_match('#^(?P<hash>[a-z0-9]+)\s+(?P<file>.+)#i', $line, $matches)) {
                $duplicates[$matches['hash']][] = $matches['file'];
            }
        }

        echo '+ Filtering duplicates...' . PHP_EOL;
        $this->duplicates = array_filter(
            $duplicates,
            function (array $files) {
                // self:: instead of static:: - the constant is private and
                // therefore not resolvable through late static binding.
                return count($files) > 1
                    && !$this->someMatches(self::IGNORE_PATTERNS, $files[0]);
            }
        );
    }

    /**
     * Copies the first file of every duplicate group to the target
     * directory, using the path suffix shared by all files of the group as
     * relative target name. Groups without a common name are skipped.
     *
     * NOTE(review): two groups resolving to the same common name end up at
     * the same target, the group processed last wins - confirm this is
     * acceptable.
     */
    public function persist()
    {
        echo '+ Persisting in file system...' . PHP_EOL;
        foreach ($this->duplicates as $files) {
            $commonName = $this->getCommonName(...$files);
            if ($commonName === null) {
                continue;
            }
            $target = $this->targetDirectory . $commonName;
            $targetDirectory = dirname($target);
            // The second is_dir() covers a directory created concurrently
            // between the first check and mkdir().
            if (!is_dir($targetDirectory)
                && !mkdir($targetDirectory, 0755, true)
                && !is_dir($targetDirectory)
            ) {
                echo '! Could not create directory ' . $targetDirectory . PHP_EOL;
                continue;
            }
            if (!copy($files[0], $target)) {
                echo '! Could not copy ' . $files[0] . ' to ' . $target . PHP_EOL;
            }
        }
    }

    /**
     * Determines the path suffix shared by all given files, reduced to
     * complete path segments (a partially matching leading directory name
     * is dropped).
     *
     * The scan is bounded by the length of the shortest path, so a single
     * path or identical paths terminate as well.
     *
     * @param string ...$files
     * @return null|string null if no files are given or the paths do not
     *                     share a single trailing character
     */
    private function getCommonName(string ...$files): ?string
    {
        if ($files === []) {
            return null;
        }
        // Reverse the paths: the common suffix becomes a common prefix.
        $reversed = array_map('strrev', $files);
        $first = array_shift($reversed);

        $common = strlen($first);
        foreach ($reversed as $candidate) {
            $limit = min($common, strlen($candidate));
            $index = 0;
            while ($index < $limit && $first[$index] === $candidate[$index]) {
                $index++;
            }
            $common = $index;
        }
        if ($common === 0) {
            return null;
        }

        // Take one character beyond the common part (if there is one), so a
        // suffix starting in the middle of a directory name is cut at the
        // following slash instead of keeping the partial name.
        $length = min($common + 1, strlen($first));
        $result = strrev(substr($first, 0, $length));
        $firstSlash = strpos($result, '/');
        if ($firstSlash === 0 || $firstSlash === false) {
            return trim($result, '/');
        }
        return substr($result, $firstSlash + 1);
    }

    /**
     * Checks whether at least one pattern matches the subject.
     *
     * @param array $patterns Regular expression fragments without delimiters
     * @param string $subject
     * @return bool
     */
    private function someMatches(array $patterns, string $subject): bool
    {
        foreach ($patterns as $pattern) {
            if (preg_match('#' . $pattern . '#', $subject)) {
                return true;
            }
        }
        return false;
    }
}
// Script entry point: look up duplicate PHP files below the current working
// directory and copy one representative of each group to "reduced/". The
// immediately invoked closure keeps the local variable out of global scope.
(static function (): void {
    $duplicateReducer = new DuplicateReducer('reduced/');
    $duplicateReducer->retrieve();
    $duplicateReducer->persist();
})();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment