Created
April 4, 2022 21:25
-
-
Save thaarok/23cf439207103961b8e7df839abdaab4 to your computer and use it in GitHub Desktop.
Magic functions to detect the encoding and separator of a CSV file and parse it
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/**
 * Reads a CSV file and returns its data rows as arrays keyed by the header row.
 *
 * The raw content is converted to UTF-8 (the source encoding is guessed by
 * detectEncoding()), a leading byte-order mark is dropped, and the separator
 * is chosen by a simple heuristic: a comma when one occurs within the first
 * 100 bytes, otherwise a semicolon.
 *
 * @param string $file Path handed to file_get_contents().
 *
 * @return array[] One entry per non-blank data line. Each entry maps the
 *                 header name of a column to the cell value; cells that have
 *                 no matching header column are kept under their numeric
 *                 position instead.
 *
 * @throws \Exception When the file cannot be read, its encoding cannot be
 *                    determined or converted, or it contains no header line.
 */
function parseCsv($file) {
    $content = file_get_contents($file);
    // Compare strictly: an empty file or a file holding just "0" is falsy
    // but was read successfully.
    if ($content === false) {
        throw new \Exception('Unable to read the CSV file');
    }

    $encoding = detectEncoding($content);
    if (!is_string($encoding) || $encoding === '') {
        throw new \Exception('Unable to detect the encoding of the CSV file');
    }

    $converted = iconv($encoding, 'UTF-8//TRANSLIT', $content);
    if ($converted === false) {
        throw new \Exception('Unable to convert the CSV file to UTF-8');
    }

    // After conversion any BOM is the UTF-8 sequence EF BB BF; remove it so
    // it does not become part of the first column name.
    if (strncmp($converted, "\xEF\xBB\xBF", 3) === 0) {
        $converted = (string) substr($converted, 3);
    }

    // Separator heuristic: comma if the beginning of the file contains one.
    $fileSample = substr($converted, 0, 100);
    $separator = (strpos($fileSample, ',') !== false) ? ',' : ';';

    // fgetcsv() needs a stream, so the converted text goes through php://temp.
    $fp = fopen('php://temp', 'r+');
    if ($fp === false) {
        throw new \Exception('Unable to open a temporary stream');
    }

    $csv = [];
    try {
        fputs($fp, $converted);
        rewind($fp);

        // Length 0 means "no line length limit"; enclosure and escape are
        // the documented defaults, spelled out explicitly.
        $cols = fgetcsv($fp, 0, $separator, '"', '\\');
        if ($cols === false) {
            throw new \Exception('No content found in the CSV file');
        }

        while (($data = fgetcsv($fp, 0, $separator, '"', '\\')) !== false) {
            // fgetcsv() reports a blank line as an array holding a single null.
            if ($data === [null]) {
                continue;
            }

            $row = [];
            foreach ($data as $i => $value) {
                // A row may be longer than the header; fall back to the
                // numeric position rather than reading an undefined offset.
                $key = array_key_exists($i, $cols) ? $cols[$i] : $i;
                $row[$key] = $value;
            }
            $csv[] = $row;
        }
    } finally {
        // Release the stream on the exception path as well.
        fclose($fp);
    }

    return $csv;
}
/**
 * Guesses the character encoding of a raw byte string.
 *
 * A byte-order mark at the start of the content wins. Without one the guess
 * comes from mb_detect_encoding(); a "UTF-8" answer is double-checked, and
 * content that is not valid UTF-8 (or that cannot be detected at all) is
 * reported as Windows-1250.
 *
 * @param string $content Raw file content.
 *
 * @return string Encoding name: 'UTF-8', 'UTF-32BE', 'UTF-32LE', 'UTF-16BE',
 *                'UTF-16LE', 'Windows-1250', or whatever name
 *                mb_detect_encoding() returned.
 */
function detectEncoding($content) {
    // Order matters: the UTF-32LE mark begins with the same two bytes as the
    // UTF-16LE mark, so the 4-byte marks must be tested before the 2-byte ones.
    $boms = [
        'UTF-8'    => "\xEF\xBB\xBF",
        'UTF-32BE' => "\x00\x00\xFE\xFF",
        'UTF-32LE' => "\xFF\xFE\x00\x00",
        'UTF-16BE' => "\xFE\xFF",
        'UTF-16LE' => "\xFF\xFE",
    ];
    foreach ($boms as $name => $bom) {
        // strncmp() is binary-safe and compares exactly as many bytes as the
        // mark is long.
        if (strncmp($content, $bom, strlen($bom)) === 0) {
            return $name;
        }
    }

    // when no BOM available, use mb_detect_encoding
    $encoding = mb_detect_encoding($content, 'auto');
    if ($encoding === false) {
        // Nothing matched; use the same fallback as for invalid UTF-8 below
        // so callers always receive a usable encoding name.
        return 'Windows-1250';
    }

    if ($encoding === 'UTF-8') { // per the original author: returned for any single-byte encoding
        // An empty pattern with the /u modifier only matches when the whole
        // subject is valid UTF-8.
        if (preg_match('!!u', $content)) {
            return 'UTF-8';
        }
        // if not valid UTF-8, suppose Windows encoding
        return 'Windows-1250';
    }

    return $encoding;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment