Created
March 2, 2024 08:08
-
-
Save divinity76/3cab164c0c7a7be51fff4d77e1534bc0 to your computer and use it in GitHub Desktop.
csv merger script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
declare(strict_types=1); | |
function fix_encoding(string $str): string | |
{ | |
//return $str; | |
return mb_convert_encoding($str, 'UTF-8', 'ISO-8859-1'); | |
} | |
function parse_csv(string $csv, string $separator = ","): array | |
{ | |
$csv = strtr( | |
$csv, | |
[ | |
"\xEF\xBB\xBF" => "", // remove UTF-8 byte order masks, if present | |
"\r\n" => "\n", // Windows CrLf=> Unix Lf | |
"\r" => "\n" // old-MacOS Cr => Unix Lf | |
// (both modern MacOS and Linux use Lf .. Windows is the only outlier) | |
] | |
); | |
$lines = explode("\n", $csv); | |
$keys = str_getcsv(array_shift($lines), $separator); | |
$ret = array(); | |
foreach ($lines as $lineno => $line) { | |
if (strlen($line) < 1) { | |
// ... probably malformed csv, but we'll allow it | |
continue; | |
} | |
$parsed = str_getcsv($line, $separator); | |
if (count($parsed) !== count($keys)) { | |
throw new \RuntimeException("error on csv line #{$lineno}: count mismatch:" . count($parsed) . ' !== ' . count($keys) . ": " . var_export([ | |
'error' => 'count mismatch', | |
'keys' => $keys, | |
'parsed' => $parsed, | |
'line' => $line | |
], true)); | |
} | |
$ret[] = array_combine($keys, $parsed); | |
} | |
return $ret; | |
} | |
error_reporting(E_ALL); | |
ini_set('display_errors', '1'); | |
set_error_handler(function ($errno, $errstr, $errfile, $errline) { | |
if (error_reporting() & $errno) { | |
throw new \ErrorException($errstr, 0, $errno, $errfile, $errline); | |
} | |
}); | |
if ($argc !== 3) { | |
echo "Usage: php {$argv[0]} <file1.csv> <file2.csv>\n"; | |
exit(1); | |
} | |
$file1 = $argv[1]; | |
$file2 = $argv[2]; | |
$raw1 = file_get_contents($file1); | |
$raw2 = file_get_contents($file2); | |
$raw1 = fix_encoding($raw1); | |
$raw2 = fix_encoding($raw2); | |
$csv1 = parse_csv($raw1); | |
$csv2 = parse_csv($raw2); | |
$merged = []; | |
// Assuming both CSVs have the same headers and the merge is based on these headers. | |
$headers = array_keys($csv1[0] + $csv2[0]); // Combine and get unique headers from both CSVs | |
foreach ($csv1 as $index => $row) { | |
foreach ($headers as $header) { | |
// If the current header exists in both csv1 and csv2 rows | |
if (isset($csv2[$index][$header], $row[$header])) { | |
// If csv1's value is empty and csv2's value is not, take csv2's value | |
if (empty($row[$header]) && !empty($csv2[$index][$header])) { | |
$merged[$index][$header] = $csv2[$index][$header]; | |
} else { | |
// Otherwise, use csv1's value (even if it's empty, since both are equal priority in this case) | |
$merged[$index][$header] = $row[$header]; | |
} | |
} elseif (isset($row[$header])) { | |
// If the header exists only in csv1, take csv1's value | |
$merged[$index][$header] = $row[$header]; | |
} elseif (isset($csv2[$index][$header])) { | |
// If the header exists only in csv2, take csv2's value | |
$merged[$index][$header] = $csv2[$index][$header]; | |
} | |
} | |
} | |
// Handle the case where csv2 is longer than csv1 | |
if (count($csv2) > count($csv1)) { | |
for ($i = count($csv1); $i < count($csv2); $i++) { | |
foreach ($headers as $header) { | |
if (isset($csv2[$i][$header])) { | |
$merged[$i][$header] = $csv2[$i][$header]; | |
} | |
} | |
} | |
} | |
function array_to_csv(array $data, string $delimiter = ",", string $enclosure = '"', string $escapeChar = "\\"): string { | |
$fh = fopen('php://temp', 'rw'); // Write to a temporary file | |
foreach ($data as $row) { | |
fputcsv($fh, $row, $delimiter, $enclosure, $escapeChar); | |
} | |
rewind($fh); | |
$csvString = stream_get_contents($fh); | |
fclose($fh); | |
return $csvString; | |
} | |
$mergedCsvString = array_to_csv($merged); | |
// For demonstration, let's just print the merged CSV string | |
echo mb_convert_encoding($mergedCsvString, 'ISO-8859-1', 'UTF-8'); | |
//echo $mergedCsvString; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment