Skip to content

Instantly share code, notes, and snippets.

@aadmathijssen
Last active July 26, 2024 11:50
Show Gist options
  • Save aadmathijssen/7e66ce81fe80d047ee560a6594c2ef59 to your computer and use it in GitHub Desktop.
Save aadmathijssen/7e66ce81fe80d047ee560a6594c2ef59 to your computer and use it in GitHub Desktop.
Find exact and approximate duplicates of source translations.
<?php
declare(strict_types=1);
namespace I18nDuplicateFinder;
use RuntimeException;
use SplFileObject;
use function array_key_exists;
use function fwrite;
use function iconv;
use function implode;
use function in_array;
use function is_array;
use function is_string;
use function preg_replace;
use function sprintf;
use function strtolower;
use function substr_replace;
use function trim;
use const PHP_EOL;
use const STDERR;
use const STDOUT;
/**
 * Reports source translations from one CSV file that also occur, exactly or
 * approximately, in a reference CSV file.
 *
 * Two sources count as approximate duplicates when they reduce to the same
 * slug (see slugify()). Only the first column of each CSV row is examined.
 */
class Application
{
    /**
     * Indexes the reference file by slug, then writes one line to STDOUT for
     * every source in the new file whose slug is present in that index.
     *
     * The reference file is read completely before the new file is opened.
     *
     * @throws RuntimeException
     */
    public function run(string $newTranslationsFilePath, string $referenceTranslationsFilePath): void
    {
        // slug => list of distinct reference sources that produce this slug
        $sourcesBySlug = [];

        foreach ($this->getCsvFileReader($referenceTranslationsFilePath) as $row) {
            $source = $this->firstColumn($row);
            if ($source === null) {
                continue;
            }

            $slug = $this->slugify($source);
            if ($slug === null) {
                continue;
            }

            $bucket = $sourcesBySlug[$slug] ?? [];
            if (!in_array($source, $bucket, true)) {
                $bucket[] = $source;
            }
            $sourcesBySlug[$slug] = $bucket;
        }

        foreach ($this->getCsvFileReader($newTranslationsFilePath) as $row) {
            $source = $this->firstColumn($row);
            if ($source === null) {
                continue;
            }

            $slug = $this->slugify($source);
            if ($slug === null || !array_key_exists($slug, $sourcesBySlug)) {
                continue;
            }

            $this->printDuplicate($source, $sourcesBySlug[$slug]);
        }
    }

    /**
     * Opens the given path as a file object that yields parsed CSV rows when
     * iterated.
     *
     * @throws RuntimeException
     */
    private function getCsvFileReader(string $filePath): SplFileObject
    {
        $reader = new SplFileObject($filePath);
        $reader->setFlags(SplFileObject::READ_CSV);

        return $reader;
    }

    /**
     * Returns the first column of an iterated CSV row, or null when the row is
     * not an array or its first cell is missing or not a string.
     *
     * @param mixed $row value produced by iterating the CSV reader
     */
    private function firstColumn($row): ?string
    {
        if (!is_array($row) || !array_key_exists(0, $row)) {
            return null;
        }

        return is_string($row[0]) ? $row[0] : null;
    }

    /**
     * Reduces a source string to a lowercase, dash-separated ASCII key.
     *
     * Returns null when a regex or transliteration step fails, or when nothing
     * is left after normalisation.
     */
    private function slugify(string $value): ?string
    {
        // Every run of characters that are neither letters nor digits becomes one dash.
        $dashed = preg_replace('~[^\pL\d]+~u', '-', $value);
        if ($dashed === null) {
            return null;
        }

        // Transliterate to ASCII.
        $ascii = iconv('utf-8', 'us-ascii//TRANSLIT', $dashed);
        if ($ascii === false) {
            return null;
        }

        // Drop whatever is still neither a dash nor a word character.
        $cleaned = preg_replace('~[^-\w]+~', '', $ascii);
        if ($cleaned === null) {
            return null;
        }

        // Strip outer dashes, then collapse inner runs of dashes.
        $collapsed = preg_replace('~-+~', '-', trim($cleaned, '-'));
        if ($collapsed === null || $collapsed === '') {
            return null;
        }

        return strtolower($collapsed);
    }

    /**
     * Writes one report line to STDOUT for a new source that has duplicates.
     *
     * An exact match prints the source alone; otherwise the source is followed
     * by the parenthesised, comma-separated reference sources, each prefixed
     * with "≈ ".
     *
     * @param array<string> $existingSources
     */
    private function printDuplicate(string $newSource, array $existingSources): void
    {
        $line = $newSource;

        if (!in_array($newSource, $existingSources, true)) {
            $approximations = [];
            foreach ($existingSources as $key => $existingSource) {
                $approximations[$key] = '≈ ' . $existingSource;
            }
            $line = sprintf('%s (%s)', $newSource, implode(', ', $approximations));
        }

        fwrite(STDOUT, $line . PHP_EOL);
    }
}
// Command-line entry point: expects exactly two arguments after the script
// name (the new translations file and the reference translations file).
if ($argc !== 3) {
    // The nowdoc body does not include the line break before its closing
    // marker, so PHP_EOL is appended to terminate the last line of help text.
    fwrite(STDERR, sprintf(<<<'HELP_TEXT'
Usage: php %s NEW_TRANSLATIONS_FILE REFERENCE_TRANSLATIONS_FILE
Find exact and approximate duplicates of source translations from NEW_TRANSLATIONS_FILE in REFERENCE_TRANSLATIONS_FILE.
Assumes files are in CSV format and the first column contains source translations.
HELP_TEXT, $argv[0]) . PHP_EOL);
    exit(1);
}

try {
    (new Application())->run($argv[1], $argv[2]);
} catch (RuntimeException $exception) {
    // run() is documented to throw RuntimeException (e.g. a file that cannot
    // be opened); report it on STDERR and exit non-zero instead of letting
    // it surface as an uncaught fatal error with a stack trace.
    fwrite(STDERR, $exception->getMessage() . PHP_EOL);
    exit(1);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment