Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save thaarok/23cf439207103961b8e7df839abdaab4 to your computer and use it in GitHub Desktop.
Save thaarok/23cf439207103961b8e7df839abdaab4 to your computer and use it in GitHub Desktop.
Magic functions to detect the encoding and separator of a CSV file and parse it
<?php
/**
 * Read a CSV file, convert it to UTF-8 and parse it into a list of rows.
 *
 * The first non-blank line is treated as the header. Every following
 * non-blank line becomes an array keyed by the header names; a value whose
 * column has no header name is kept under its numeric column index.
 *
 * The encoding is guessed by detectEncoding() and the separator is guessed
 * from the header line: ',' when it contains at least as many commas as
 * semicolons (and at least one), ';' otherwise.
 *
 * @param string $file Path passed to file_get_contents().
 * @return array[] List of rows (header name => field value).
 * @throws Exception When the file cannot be read, its encoding cannot be
 *                   detected or converted, or it contains no header line.
 */
function parseCsv($file) {
    $content = file_get_contents($file);
    // Compare against false explicitly: an empty file (or the content "0")
    // is readable and must not be reported as a read failure.
    if ($content === false) {
        throw new Exception('Unable to read the CSV file');
    }

    $encoding = detectEncoding($content);
    if (!$encoding) {
        throw new Exception('Unable to detect the encoding of the CSV file');
    }
    $converted = iconv($encoding, 'UTF-8//TRANSLIT', $content);
    if ($converted === false) {
        throw new Exception('Unable to convert the CSV file to UTF-8');
    }
    $content = $converted;

    // A byte order mark survives the conversion as a UTF-8 BOM and would
    // otherwise end up glued to the first header name.
    if (strncmp($content, "\xEF\xBB\xBF", 3) === 0) {
        $content = (string) substr($content, 3);
    }

    // Guess the separator from the header line only, so that e.g. a decimal
    // comma inside a data row cannot override a semicolon-separated header.
    $sample = ltrim($content, "\r\n");
    $headerLine = (string) substr($sample, 0, strcspn($sample, "\r\n"));
    $separator = ';';
    $commaCount = substr_count($headerLine, ',');
    if ($commaCount > 0 && $commaCount >= substr_count($headerLine, ';')) {
        $separator = ',';
    }

    // fgetcsv() needs a stream, so replay the converted content through a
    // temporary in-memory stream.
    $fp = fopen('php://temp', 'r+');
    fputs($fp, $content);
    rewind($fp);

    // parse header (length 0 = no line length limit; a blank line is
    // returned by fgetcsv() as an array holding a single null)
    do {
        $cols = fgetcsv($fp, 0, $separator, '"', '\\');
    } while ($cols === [null]);
    if ($cols === false) {
        fclose($fp);
        throw new Exception('No content found in the CSV file');
    }

    // parse rows
    $csv = [];
    while (($data = fgetcsv($fp, 0, $separator, '"', '\\')) !== false) {
        if ($data === [null]) {
            continue; // skip blank lines
        }
        $row = [];
        foreach ($data as $i => $value) {
            // Rows wider than the header keep the extra values under their
            // numeric column index instead of reading an undefined offset.
            $key = isset($cols[$i]) ? $cols[$i] : $i;
            $row[$key] = $value;
        }
        $csv[] = $row;
    }
    fclose($fp);

    return $csv;
}
/**
 * Guess the character encoding of raw file content.
 *
 * A leading byte order mark (UTF-8, UTF-32 BE/LE, UTF-16 BE/LE) decides the
 * encoding when present. Otherwise mb_detect_encoding() is consulted; when it
 * answers UTF-8 or cannot decide, the content is validated as UTF-8 and
 * Windows-1250 is assumed if that validation fails.
 *
 * @param string $content Raw bytes to inspect.
 * @return string Encoding name (usable as an iconv() source encoding).
 */
function detectEncoding($content) {
    // try to detect BOM first
    // The UTF-32 BOMs must be tested before the UTF-16 ones: the UTF-32LE BOM
    // starts with the same two bytes as the UTF-16LE BOM.
    $boms = [
        'UTF-8'    => "\xEF\xBB\xBF",
        'UTF-32BE' => "\x00\x00\xFE\xFF",
        'UTF-32LE' => "\xFF\xFE\x00\x00",
        'UTF-16BE' => "\xFE\xFF",
        'UTF-16LE' => "\xFF\xFE",
    ];
    foreach ($boms as $name => $bom) {
        // strncmp() is binary safe and compares the full BOM length, so the
        // four-byte UTF-32 marks are really matched against four bytes.
        if (strncmp($content, $bom, strlen($bom)) === 0) {
            return $name;
        }
    }

    // when no BOM available, use mb_detect_encoding
    $encoding = mb_detect_encoding($content, 'auto');
    if ($encoding === 'UTF-8' || $encoding === false) {
        // mb_detect_encoding() can answer UTF-8 for content that is really in
        // a single-byte encoding, so validate it: a pattern with the /u
        // modifier only matches when the subject is valid UTF-8.
        if (preg_match('!!u', $content)) {
            return 'UTF-8';
        }
        // if not valid UTF-8, suppose Windows encoding
        return 'Windows-1250';
    }

    return $encoding;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment