Skip to content

Instantly share code, notes, and snippets.

@tracker1
Last active April 20, 2017 21:49
Show Gist options
  • Save tracker1/bf1bdbe39699fa497bb993ac0cd3b359 to your computer and use it in GitHub Desktop.
Save tracker1/bf1bdbe39699fa497bb993ac0cd3b359 to your computer and use it in GitHub Desktop.
iconv-lite encoding detection
// TODO: defer work with process.nextTick so that this doesn't block for as long
import iconv from 'iconv-lite';
// UTF-8 BOM     - 0xEF, 0xBB, 0xBF
// UTF-16 BE BOM - 0xFE, 0xFF
// UTF-16 LE BOM - 0xFF, 0xFE
/**
 * Decode a buffer of unknown text encoding into a string.
 *
 * A leading byte-order mark, when present, selects the decoder (readWithBOM);
 * otherwise the buffer is handed to readWithoutBOM.
 *
 * @param {Buffer} buffer - Raw bytes to decode.
 * @returns {Promise<string|null>} The decoded text, '' for an empty buffer, or
 *   null when the no-BOM decoder produced no text.
 * @throws {Error} 'Input is not a buffer' (as a rejected promise) when
 *   `buffer` is not a Buffer instance.
 */
export default async function readTextFromBuffer(buffer) {
  if (!(buffer instanceof Buffer)) throw new Error('Input is not a buffer');
  if (!buffer.length) return '';

  // A buffer holding nothing but a BOM legitimately decodes to ''. Test for
  // "a string came back" instead of truthiness, otherwise the BOM bytes
  // themselves would be re-decoded as text by the no-BOM fallback.
  const fromBOM = readWithBOM(buffer);
  if (typeof fromBOM === 'string') return fromBOM;

  return readWithoutBOM(buffer) || null;
}
/**
 * Test whether `buffer` starts with the byte sequence `check`.
 *
 * @param {Buffer} buffer - Bytes to inspect.
 * @param {number[]} check - Expected leading byte values (the BOM signature).
 * @returns {boolean} True when `buffer` is at least as long as `check` and
 *   each leading byte equals the corresponding entry of `check`.
 */
function hasBOM(buffer, check) {
  if (buffer.length < check.length) return false;
  // Strict equality: a signature entry must be the same number as the byte,
  // never a value that merely coerces to it (e.g. the string '239').
  return check.every((expected, index) => expected === buffer[index]);
}
/**
 * Decode `buffer` according to its leading byte-order mark, if it has one.
 *
 * Recognised marks: UTF-8 (EF BB BF), UTF-16 big-endian (FE FF) and
 * UTF-16 little-endian (FF FE).
 *
 * @param {Buffer} buffer - Bytes to inspect and decode.
 * @returns {string|undefined} The decoded text, or undefined when the buffer
 *   does not start with a recognised BOM.
 */
function readWithBOM(buffer) {
  if (hasBOM(buffer, [0xEF, 0xBB, 0xBF])) {
    // utf8 - Buffer#toString keeps the BOM as a leading U+FEFF, so start
    // decoding after the 3 signature bytes. This matches the UTF-16 branches:
    // iconv-lite's decode strips the BOM by default (per its documentation).
    return buffer.toString('utf8', 3);
  }
  if (hasBOM(buffer, [0xFE, 0xFF])) {
    // utf16 be
    return iconv.decode(buffer, 'utf16-be');
  }
  if (hasBOM(buffer, [0xFF, 0xFE])) {
    // utf16 le
    return iconv.decode(buffer, 'utf16-le');
  }
  // No recognised BOM.
  return undefined;
}
/**
 * Decode a buffer that carries no byte-order mark.
 *
 * UTF-8 is tried first. If that yields nothing, or yields the Unicode
 * replacement character, the buffer is decoded again as win1252 and that
 * result is preferred - unless the win1252 text contains the three characters
 * that the byte sequence EF BF BD produces, in which case the UTF-8 result is
 * kept.
 *
 * @param {Buffer} buffer - Bytes to decode.
 * @returns {string|null} The decoded text, or null when the chosen decoding is empty.
 */
function readWithoutBOM(buffer) {
  // U+FFFD, the Unicode "unknown character" marker.
  const replacementChar = String.fromCharCode(65533);
  // What the bytes EF BF BD (UTF-8 for U+FFFD) look like after a single-byte decode.
  const replacementAsSingleBytes = "\xEF\xBF\xBD";

  let utf8Text = null;
  try {
    utf8Text = buffer.toString('utf8');
    // TODO: remove final 2-3 characters in case of partial buffer, to not throw detection off.
  } catch (err) {
    utf8Text = null;
  }

  // Clean, non-empty UTF-8: nothing more to do.
  const isCleanUtf8 = Boolean(utf8Text) && !utf8Text.includes(replacementChar);
  if (isCleanUtf8) return utf8Text;

  // TODO: check the buffer for "unknown" directly before win1252 conversion
  const win1252Text = iconv.decode(buffer, 'win1252');

  // The replacement character was already present in the input bytes, so the
  // UTF-8 reading stands.
  if (win1252Text.includes(replacementAsSingleBytes)) return utf8Text || null;

  return win1252Text || null;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment