Last active
April 20, 2017 21:49
-
-
Save tracker1/bf1bdbe39699fa497bb993ac0cd3b359 to your computer and use it in GitHub Desktop.
iconv-lite encoding detection
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// TODO: add process.nextTick foo so that this doesn't block as long | |
import iconv from 'iconv-lite'; | |
// UTF-8 BOM     = 0xEF, 0xBB, 0xBF
// UTF-16 BE BOM = 0xFE, 0xFF
// UTF-16 LE BOM = 0xFF, 0xFE
/**
 * Decodes a buffer of unknown text encoding into a string.
 *
 * A byte-order mark is honoured first; when none is present the bytes are
 * handed to the BOM-less detection in `readWithoutBOM`.
 *
 * @param {Buffer} buffer - Raw bytes to decode.
 * @returns {Promise<string|null>} The decoded text, `''` for an empty buffer,
 *   or `null` when the BOM-less path produces no text.
 * @throws {Error} If `buffer` is not a `Buffer` (surfaces as a rejection,
 *   because the function is `async`).
 */
export default async function readTextFromBuffer(buffer) {
  if (!(buffer instanceof Buffer)) throw new Error('Input is not a buffer');
  if (!buffer.length) return '';

  // A BOM match is decided by whether a string came back, not by truthiness:
  // a buffer holding only a BOM legitimately decodes to '' and must not fall
  // through to the heuristic path, which would decode the BOM bytes as text.
  const bomText = readWithBOM(buffer);
  if (typeof bomText === 'string') return bomText;

  return readWithoutBOM(buffer) || null;
}
/**
 * Reports whether `buffer` begins with the byte sequence `check`.
 *
 * @param {Buffer} buffer - Bytes to inspect; only `length` and indexed
 *   access are used.
 * @param {number[]} check - Expected leading bytes, in order.
 * @returns {boolean} `true` when every byte of `check` equals the byte at the
 *   same index in `buffer`; `false` when `buffer` is shorter than `check`.
 */
function hasBOM(buffer, check) {
  if (buffer.length < check.length) return false;
  // Strict equality so only real byte values match; loose `==` would also
  // accept coercible values such as '239' or true.
  return check.every((expected, index) => buffer[index] === expected);
}
/**
 * Decodes `buffer` according to a leading byte-order mark, if it has one.
 *
 * Recognised signatures: UTF-8 (EF BB BF), UTF-16 BE (FE FF) and
 * UTF-16 LE (FF FE).
 *
 * @param {Buffer} buffer - Raw bytes to decode.
 * @returns {string|undefined} The decoded text, or `undefined` when the
 *   buffer does not start with a recognised BOM.
 */
function readWithBOM(buffer) {
  if (hasBOM(buffer, [0xEF, 0xBB, 0xBF])) {
    // Buffer#toString keeps the BOM as a leading U+FEFF, so start decoding
    // after the 3-byte signature. This matches the UTF-16 branches, where
    // iconv-lite strips the BOM by default.
    return buffer.toString('utf8', 3);
  }

  if (hasBOM(buffer, [0xFE, 0xFF])) {
    return iconv.decode(buffer, 'utf16-be');
  }

  if (hasBOM(buffer, [0xFF, 0xFE])) {
    return iconv.decode(buffer, 'utf16-le');
  }

  // No recognised BOM: the caller falls back to BOM-less detection.
  return undefined;
}
/**
 * Decodes a BOM-less buffer, preferring UTF-8 and falling back to
 * Windows-1252 when the UTF-8 decode contains replacement characters.
 *
 * @param {Buffer} buffer - Raw bytes to decode.
 * @returns {string|null} The decoded text, or `null` when no non-empty text
 *   could be produced.
 */
function readWithoutBOM(buffer) {
  // U+FFFD is what a decoder emits for bytes it cannot map.
  const REPLACEMENT_CHAR = '\uFFFD';
  // The UTF-8 bytes of U+FFFD (EF BF BD) as they look when read as win1252.
  // Finding this means the source really contained an encoded U+FFFD.
  const REPLACEMENT_AS_WIN1252 = '\xEF\xBF\xBD';

  let utf8Text;
  try {
    utf8Text = buffer.toString('utf8');
    // TODO: remove final 2-3 characters in case of partial buffer, to not throw detection off.
  } catch (err) {
    // Best effort: a failed UTF-8 decode just means we try win1252 below.
    utf8Text = null;
  }

  if (utf8Text && !utf8Text.includes(REPLACEMENT_CHAR)) {
    return utf8Text;
  }

  // TODO: check the buffer for "unknown" directly before win1252 conversion
  const win1252Text = iconv.decode(buffer, 'win1252');

  // Keep the win1252 reading only if it is clean: no bytes that are undefined
  // in that code page (decoded to U+FFFD), and no mojibake showing the input
  // was UTF-8 that already contained a literal U+FFFD.
  const hasUndefinedBytes = win1252Text.includes(REPLACEMENT_CHAR);
  const sourceHadReplacement = win1252Text.includes(REPLACEMENT_AS_WIN1252);
  if (!hasUndefinedBytes && !sourceHadReplacement) {
    return win1252Text || null;
  }

  return utf8Text || null;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment