Last active
July 26, 2024 11:50
-
-
Save aadmathijssen/7e66ce81fe80d047ee560a6594c2ef59 to your computer and use it in GitHub Desktop.
Find exact and approximate duplicates of source translations.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
declare(strict_types=1); | |
namespace I18nDuplicateFinder; | |
use RuntimeException; | |
use SplFileObject; | |
use function array_key_exists; | |
use function fwrite; | |
use function iconv; | |
use function implode; | |
use function in_array; | |
use function is_array; | |
use function is_string; | |
use function preg_replace; | |
use function sprintf; | |
use function strtolower; | |
use function substr_replace; | |
use function trim; | |
use const PHP_EOL; | |
use const STDERR; | |
use const STDOUT; | |
/**
 * Reports source translations from one CSV file that already occur, exactly or
 * approximately, in a reference CSV file.
 *
 * Two sources count as approximate duplicates when slugify() maps them to the same key.
 */
class Application
{
    /**
     * Indexes the first column of the reference file by slug, then prints every
     * first-column value of the new file whose slug is present in that index.
     *
     * @throws RuntimeException
     */
    public function run(string $newTranslationsFilePath, string $referenceTranslationsFilePath): void
    {
        // slug => list of distinct reference sources that produced that slug
        $sourcesBySlug = [];

        foreach ($this->getCsvFileReader($referenceTranslationsFilePath) as $row) {
            // Only rows that are arrays with a string in the first column are considered.
            $source = is_array($row) ? ($row[0] ?? null) : null;
            if (!is_string($source)) {
                continue;
            }

            $slug = $this->slugify($source);
            if ($slug === null) {
                continue;
            }

            $bucket = $sourcesBySlug[$slug] ?? [];
            if (!in_array($source, $bucket, true)) {
                $bucket[] = $source;
            }
            $sourcesBySlug[$slug] = $bucket;
        }

        foreach ($this->getCsvFileReader($newTranslationsFilePath) as $row) {
            $source = is_array($row) ? ($row[0] ?? null) : null;
            if (!is_string($source)) {
                continue;
            }

            $slug = $this->slugify($source);
            if ($slug === null || !isset($sourcesBySlug[$slug])) {
                continue;
            }

            $this->printDuplicate($source, $sourcesBySlug[$slug]);
        }
    }

    /**
     * Opens the given file as an iterator that parses each line as CSV.
     *
     * @throws RuntimeException
     */
    private function getCsvFileReader(string $filePath): SplFileObject
    {
        $reader = new SplFileObject($filePath);
        $reader->setFlags(SplFileObject::READ_CSV);

        return $reader;
    }

    /**
     * Reduces a source string to a lowercase, dash-separated ASCII key.
     *
     * Returns null when any conversion step fails or when nothing is left
     * after normalisation.
     */
    private function slugify(string $value): ?string
    {
        // Every run of characters that are neither letters nor digits becomes one dash.
        $dashed = preg_replace('~[^\pL\d]+~u', '-', $value);
        if ($dashed === null) {
            return null;
        }

        // Transliterate to ASCII.
        $ascii = iconv('utf-8', 'us-ascii//TRANSLIT', $dashed);
        if ($ascii === false) {
            return null;
        }

        // Drop whatever is still neither a dash nor a word character.
        $cleaned = preg_replace('~[^-\w]+~', '', $ascii);
        if ($cleaned === null) {
            return null;
        }

        // Strip outer dashes, then squeeze inner runs of dashes into one.
        $collapsed = preg_replace('~-+~', '-', trim($cleaned, '-'));
        if ($collapsed === null) {
            return null;
        }

        $slug = strtolower($collapsed);

        return $slug === '' ? null : $slug;
    }

    /**
     * Writes one report line to STDOUT: the bare source for an exact duplicate,
     * otherwise the source followed by its approximate matches in parentheses.
     *
     * @param array<string> $existingSources
     */
    private function printDuplicate(string $newSource, array $existingSources): void
    {
        if (in_array($newSource, $existingSources, true)) {
            fwrite(STDOUT, $newSource . PHP_EOL);

            return;
        }

        // Each approximate match is shown with a leading "≈ " marker.
        $marked = [];
        foreach ($existingSources as $existingSource) {
            $marked[] = '≈ ' . $existingSource;
        }

        fwrite(STDOUT, sprintf('%s (%s)', $newSource, implode(', ', $marked)) . PHP_EOL);
    }
}
// Command-line entry point: expects exactly two arguments (the new translations
// file and the reference translations file); otherwise prints usage and exits with status 1.
if ($argc !== 3) {
    // The nowdoc body carries no trailing newline, so one is appended explicitly.
    fwrite(STDERR, sprintf(<<<'HELP_TEXT'
Usage: php %s NEW_TRANSLATIONS_FILE REFERENCE_TRANSLATIONS_FILE
Find exact and approximate duplicates of source translations from NEW_TRANSLATIONS_FILE in REFERENCE_TRANSLATIONS_FILE.
Assumes files are in CSV format and the first column contains source translations.
HELP_TEXT, $argv[0]) . PHP_EOL);
    exit(1);
}

try {
    (new Application())->run($argv[1], $argv[2]);
} catch (RuntimeException $exception) {
    // run() is documented to throw RuntimeException (e.g. when an input file
    // cannot be opened); report it briefly instead of dying with a stack trace.
    fwrite(STDERR, sprintf('Error: %s', $exception->getMessage()) . PHP_EOL);
    exit(1);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment