PHP script that removes duplicate rows from a CSV file
<?php

if ($argc < 2) {
    exit('Error: No CSV file provided. Example usage: php script.php input.csv' . PHP_EOL);
}

if (pathinfo($argv[1], PATHINFO_EXTENSION) !== 'csv') {
    exit('Error: Provided file is not a CSV.' . PHP_EOL);
}

$inputCsv = $argv[1];

// Read the CSV file into an array of rows (each row is an array of fields).
function parseCsv(string $csvFile): array {
    $lines = file($csvFile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
    return array_map('str_getcsv', $lines);
}

// Remove duplicate rows. Each row is serialized so array_unique() can compare
// rows by value; the array keys of the kept rows are preserved.
function findUniqueRows(array $data): array {
    $serializedRows = array_map('serialize', $data);
    $uniqueSerializedRows = array_unique($serializedRows);
    return array_map('unserialize', $uniqueSerializedRows);
}

// Overwrite the input CSV with only the unique rows.
function updateCsv(string $csvFile, array $uniqueData): void {
    $fileHandle = fopen($csvFile, 'w');
    foreach ($uniqueData as $fields) {
        fputcsv($fileHandle, $fields);
    }
    fclose($fileHandle);
    echo "CSV file updated with unique rows." . PHP_EOL;
}

// Write the removed duplicate rows to duplicates_<name>.csv next to the input file.
function writeDuplicatesCsv(string $inputCsv, array $originalData, array $uniqueData): void {
    // The rows whose keys were dropped by findUniqueRows() are the duplicates.
    $duplicateData = array_diff_key($originalData, $uniqueData);
    $duplicateCsvFile = pathinfo($inputCsv, PATHINFO_DIRNAME) . '/duplicates_' . pathinfo($inputCsv, PATHINFO_FILENAME) . '.csv';
    $fileHandle = fopen($duplicateCsvFile, 'w');
    foreach ($duplicateData as $fields) {
        fputcsv($fileHandle, $fields);
    }
    fclose($fileHandle);
    echo "Duplicates written to: $duplicateCsvFile" . PHP_EOL;
}

$data = parseCsv($inputCsv);
$uniqueData = findUniqueRows($data);
updateCsv($inputCsv, $uniqueData);
writeDuplicatesCsv($inputCsv, $data, $uniqueData);
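
For reference, a minimal sketch of how the serialize-based deduplication behaves in isolation. The sample rows below are hypothetical and not part of the gist; they only illustrate that array_unique() keeps the first occurrence of each row and preserves its key, so array_diff_key() recovers the removed duplicates.

<?php
// Hypothetical rows; the last one repeats the first.
$rows = [
    ['sku-1', 'red'],
    ['sku-2', 'blue'],
    ['sku-1', 'red'],
];

$unique = array_map('unserialize', array_unique(array_map('serialize', $rows)));
$duplicates = array_diff_key($rows, $unique);

print_r($unique);     // keys 0 and 1 kept: ['sku-1', 'red'] and ['sku-2', 'blue']
print_r($duplicates); // key 2: the repeated ['sku-1', 'red'] row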
Regarding the comment on line 23: can the unique function also create a new CSV file and write the duplicate values/rows to it? The purpose is to compare the removed duplicates against the unique values for analysis. Thank you.
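
The writeDuplicatesCsv() function in the script above covers this: the rows dropped by findUniqueRows() are written to duplicates_<name>.csv next to the input file. For the kind of comparison the comment describes, a minimal sketch follows, assuming a hypothetical input.csv in the current directory that has already been processed by the script:

<?php
// Hedged sketch: compare the two output files after running the script.
// 'input.csv' and 'duplicates_input.csv' are assumed example file names.
$readCsv = fn(string $file): array =>
    array_map('str_getcsv', file($file, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES));

$unique = $readCsv('input.csv');
$duplicates = $readCsv('duplicates_input.csv');

echo count($unique) . ' unique rows, ' . count($duplicates) . ' duplicate rows removed.' . PHP_EOL;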