Created
July 1, 2015 20:27
-
-
Save nfriedly/c4c41c0b053a1106ebe1 to your computer and use it in GitHub Desktop.
iconv html stream decoder (reads charset from <meta> tag, uses it to decode document to utf8)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// buffers a stream of html until it sees a charset meta tag (or an opening <?xml tag with an encoding),
// then it creates an iconv-lite decoder for the charset and sends all data (the buffer and any future data) through it, emitting node.js-friendly utf8
// if it cannot find a charset by the time the </head> tag is reached, it gives up and just calls .toString() on each chunk hoping that it's in a format that node.js can read
// based on iconv-lite's decodeStream
// todo: clean this up, add some tests, and stick it on npm
// == Decoder stream ======================================================= | |
/**
 * Transform stream that buffers incoming HTML until a charset declaration is
 * found, then decodes everything from that charset and emits utf8.
 *
 * State kept on the instance:
 *   - buff:          bytes received while the charset is still unknown
 *   - isBuffering:   true until a charset (or the utf8 fallback) is chosen
 *   - inputEncoding: charset the input is decoded from (defaults to 'utf8')
 *   - encoding:      the *output* encoding, always 'utf8'
 *   - conv:          iconv decoder for inputEncoding
 *
 * @param {Object} [options] - options forwarded to Transform; `encoding` is
 *   always overridden with 'utf8'. The caller's object is not modified.
 */
function HTMLDecodeStream(options) {
  // Work on a copy so that forcing the output encoding does not leak into the
  // caller's options object.
  var opts = {};
  for (var key in options) {
    opts[key] = options[key];
  }
  opts.encoding = 'utf8'; // this is the *output* encoding

  this.buff = Buffer.alloc(0);
  this.isBuffering = true;
  this.inputEncoding = 'utf8';
  this.encoding = opts.encoding;
  // The converter is fed raw Buffers (see _flush when no charset was ever
  // detected), so it has to be a decoder rather than an encoder.
  this.conv = iconv.getDecoder(this.inputEncoding);
  Transform.call(this, opts);
}
// Inherit from Transform, then point `constructor` back at HTMLDecodeStream.
// Omitted descriptor flags default to false, so the property is non-writable,
// non-enumerable and non-configurable.
HTMLDecodeStream.prototype = Object.create(Transform.prototype);
Object.defineProperty(HTMLDecodeStream.prototype, 'constructor', {
  value: HTMLDecodeStream
});
/**
 * Transform hook: rejects non-Buffer input, otherwise hands the chunk to
 * bufferAndTest while the charset is still unknown and to stream afterwards.
 *
 * @param {Buffer} chunk - raw bytes of the document
 * @param {string} encoding - passed through to the selected handler
 * @param {Function} done - completion callback; receives an Error for non-Buffer input
 */
HTMLDecodeStream.prototype._transform = function(chunk, encoding, done) {
  var isBinary = Buffer.isBuffer(chunk);
  if (!isBinary) {
    return done(new Error("delayed decoding stream needs buffers as its input."));
  }

  var handlerName = this.isBuffering ? 'bufferAndTest' : 'stream';
  this[handlerName](chunk, encoding, done);
};
/**
 * Runs one chunk through the current converter and pushes any non-empty
 * result downstream.
 *
 * @param {Buffer} chunk - bytes to convert
 * @param {string} encoding - unused here; kept for a uniform handler signature
 * @param {Function} done - called once: with the error if converting or
 *   pushing threw, otherwise with no arguments
 */
HTMLDecodeStream.prototype.stream = function(chunk, encoding, done) {
  try {
    var res = this.conv.write(chunk);
    if (res && res.length) this.push(res, this.encoding);
  } catch (e) {
    return done(e);
  }
  // Kept outside the try block: if the callback itself throws, the catch
  // above must not run and invoke it a second time.
  done();
};
/**
 * Appends the chunk to the pending buffer and checks whether the buffered
 * document has declared its charset yet.
 *
 * - A <meta ... charset=...> tag or an <?xml ... encoding="..."?> declaration
 *   switches the stream to that charset.
 * - Reaching </head> without a declaration falls back to utf8.
 * - Otherwise the chunk is simply acknowledged so more data can arrive.
 *
 * Matching is case-insensitive because HTML tag and attribute names are.
 *
 * @param {Buffer} chunk - newly received bytes
 * @param {string} encoding - passed through to startStreaming
 * @param {Function} done - completion callback for this chunk
 */
HTMLDecodeStream.prototype.bufferAndTest = function(chunk, encoding, done) {
  this.buff = Buffer.concat([this.buff, chunk]);
  var str = this.buff.toString();
  // extract the charset from a meta tag or the opening <?xml tag
  var charsetMatch = str.match(/<meta\s[^>]*charset=['"]?([^ '">]+)/i) ||
    str.match(/<\?xml[^>]+encoding=["']([^"'>]+)["']/i);
  var endOfHead = /<\/head>/i.test(str);
  if (charsetMatch) {
    this.startStreaming(charsetMatch[1], encoding, done);
  } else if (endOfHead) {
    // go with the safest guess for the charset
    this.startStreaming('utf8', encoding, done);
  } else {
    // Still undecided: the callback must be called anyway, otherwise the
    // Transform never hands over another chunk and the stream stalls.
    done();
  }
};
/**
 * Leaves buffering mode: selects the decoder, announces the chosen charset
 * via a 'charset' event, and forwards everything buffered so far.
 *
 * @param {string} charset - charset name found in the document (or 'utf8')
 * @param {string} encoding - passed through to stream
 * @param {Function} done - completion callback for the chunk that triggered the switch
 */
HTMLDecodeStream.prototype.startStreaming = function(charset, encoding, done) {
  // setup the decoder
  if (iconv.encodingExists(charset)) {
    this.inputEncoding = charset;
  } else {
    // Report the charset that was rejected, not the fallback we keep using.
    console.error("unrecognized charset %s, decoding as utf8", charset);
  }
  // Always install a decoder for the encoding actually in use, so the
  // fallback path does not depend on whatever converter was set up earlier.
  this.conv = iconv.getDecoder(this.inputEncoding);
  this.emit('charset', this.inputEncoding);
  this.isBuffering = false;

  // Detach the buffer before forwarding it so _flush cannot send it twice,
  // even if the callback triggers more work or throws.
  var pending = this.buff;
  this.buff = null;
  // decode and forward our existing buffer
  this.stream(pending, encoding, done);
};
/**
 * Flush hook: converts anything still sitting in the pending buffer (the
 * input ended before a charset was chosen), then finalizes the converter and
 * pushes whatever it returns.
 *
 * @param {Function} done - called once: with the error if converting or
 *   pushing threw, otherwise with no arguments
 */
HTMLDecodeStream.prototype._flush = function(done) {
  var res;
  try {
    if (this.buff) {
      res = this.conv.write(this.buff);
      if (res && res.length) this.push(res, this.encoding);
      this.buff = null;
    }
    res = this.conv.end();
    if (res && res.length) this.push(res, this.encoding);
  } catch (e) {
    return done(e);
  }
  // Kept outside the try block so an exception thrown by the callback is not
  // caught above and answered with a second callback invocation.
  done();
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment