Skip to content

Instantly share code, notes, and snippets.

@kopiro
Created March 30, 2015 21:38
Show Gist options
  • Save kopiro/4431f2c59af979223d76 to your computer and use it in GitHub Desktop.
Save kopiro/4431f2c59af979223d76 to your computer and use it in GitHub Desktop.
WARC stream parser, improved to also accept bare \n and \r line endings (in addition to \r\n)
var stream = require('stream'),
util = require('util');
// The module's only export is the WARCStream constructor (a function
// declaration further down, so it is already hoisted when this line runs).
module.exports = WARCStream;
/**
 * Parser states, in the order a record is consumed:
 * protocol line -> header lines -> content bytes -> record separator.
 * Frozen so the shared constant cannot be modified by accident.
 */
var STATE = Object.freeze({
  PROTOCOL: 1,  // expecting the first line of a record
  HEADERS: 2,   // reading "Name: value" lines until a blank line
  CONTENT: 3,   // reading the number of bytes given by Content-Length
  SEPARATOR: 4  // expecting the blank lines that follow the content
});
// Splits one header line into name (group 1) and value (group 2) at the first ": ".
// The value group used to be `[^$]+`; inside a character class `$` is a literal
// dollar sign, so every header whose value contained "$" failed to match.
var headerRegex = /^([^:]+): ([\s\S]+)$/;
/**
 * Transform stream that parses WARC data written to it.
 *
 * For every record, `_transform` emits 'protocol', 'headers' and 'content'
 * events and pushes one `{ protocol, headers, content }` object.
 *
 * @param {Object} [opts] Options forwarded to stream.Transform. `objectMode`
 *   defaults to true when it is not given. The object is copied, never modified.
 * @returns {WARCStream} The new stream (also when called without `new`).
 */
function WARCStream(opts) {
  var options = {};
  var key;

  if (!(this instanceof WARCStream)) {
    // Forward the options; they used to be dropped on this code path.
    return new WARCStream(opts);
  }

  // Copy the enumerable options (inherited ones included) so that defaulting
  // objectMode below does not write into the caller's object.
  // `opts` may be undefined or null.
  if (opts) {
    for (key in opts) {
      options[key] = opts[key];
    }
  }
  if (typeof options.objectMode === 'undefined') {
    options.objectMode = true;
  }

  stream.Transform.call(this, options);

  this.state = STATE.PROTOCOL;
  // Bytes received but not parsed yet, and the read position inside them.
  this.data = Buffer.alloc(0);
  this.offset = 0;
  // Fields of the record currently being assembled.
  this.protocol = null;
  this.headers = {};
  this.contentLength = 0;
  this.content = Buffer.alloc(0);
  // Accepted line terminators and record separators, longest first.
  this.matcher = [Buffer.from('\r\n'), Buffer.from('\n'), Buffer.from('\r')];
  this.separator = [Buffer.from('\r\n\r\n'), Buffer.from('\n\n'), Buffer.from('\r\r')];
}
util.inherits(WARCStream, stream.Transform);
/**
 * Transform implementation: appends `chunk` to the unparsed data and runs the
 * state machine (protocol -> headers -> content -> separator) for as long as
 * a step succeeds and unread bytes remain.
 *
 * Emits 'protocol', 'headers' and 'content' as each part of a record is
 * completed and pushes `{ protocol, headers, content }` once per record.
 *
 * @param {Buffer} chunk Next piece of WARC data.
 * @param {string} enc Encoding argument supplied by the stream; not used.
 * @param {Function} cb Called with no arguments when the chunk has been processed.
 */
WARCStream.prototype._transform = function (chunk, enc, cb) {
  var result;
  var declaredLength;

  // Continue from whatever an earlier call could not finish parsing.
  this.data = Buffer.concat([this.data, chunk]);

  do {
    switch (this.state) {
      case STATE.PROTOCOL:
        this.protocol = null;
        result = this.parseProtocol();
        if (result) {
          this.state = STATE.HEADERS;
          this.headers = {};
          this.emit('protocol', this.protocol);
        }
        break;

      case STATE.HEADERS:
        // parseHeaders() reports true only once the state has moved on to
        // CONTENT, so no state change is needed here.
        result = this.parseHeaders();
        if (result) {
          declaredLength = parseInt(this.headers['Content-Length'], 10);
          // A missing, unparsable or negative length is treated as "no
          // content". Letting NaN or a negative number through corrupts
          // this.offset in parseContent() and stalls the parser.
          this.contentLength = declaredLength >= 0 ? declaredLength : 0;
          this.content = Buffer.alloc(0);
          this.emit('headers', this.headers);
        }
        break;

      case STATE.CONTENT:
        result = this.parseContent();
        if (result) {
          this.state = STATE.SEPARATOR;
          this.emit('content', this.content);
          this.push({
            protocol: this.protocol,
            headers: this.headers,
            content: this.content
          });
        }
        break;

      case STATE.SEPARATOR:
        result = this.parseSeparator();
        if (result) {
          this.state = STATE.PROTOCOL;
        }
        break;

      default:
        result = false;
        break;
    }
  } while (result && this.offset < this.data.length);

  // Keep only the bytes that have not been consumed yet.
  this.data = this.data.slice(this.offset);
  this.offset = 0;
  cb();
};
/**
 * Flush hook of the Transform: nothing is emitted at end of input.
 * Bytes still held in `this.data` (an incomplete record) are not pushed.
 *
 * @param {Function} cb Invoked immediately, without arguments.
 */
WARCStream.prototype._flush = function (cb) {
  cb();
};
/**
 * Reads the first line of a record (the protocol/version line) starting at
 * `this.offset`.
 *
 * On success the line, without its terminator, is stored as a string in
 * `this.protocol` and `this.offset` is moved past the terminator.
 *
 * @returns {boolean} true when a complete line was consumed, false when no
 *   complete line is buffered yet (nothing is changed in that case).
 */
WARCStream.prototype.parseProtocol = function () {
  var idx = firstMatch(this.matcher, this.data, this.offset);
  var lineEnd = idx ? idx.index : false;

  // Strictly "<", like parseHeader/parseSeparator: a terminator reported at
  // or beyond the end of the buffer is not fully buffered yet. Accepting it
  // would move this.offset past the data and lose the rest of a terminator
  // that arrives with the next chunk.
  if (typeof lineEnd !== 'number' || !(lineEnd < this.data.length)) {
    return false;
  }

  this.protocol = this.data.slice(this.offset, lineEnd).toString();
  this.offset = lineEnd + idx.matcher.length;
  return true;
};
/**
 * Consumes header lines by calling parseHeader() until it stops.
 *
 * @returns {boolean} true when parsing stopped with the state set to
 *   STATE.CONTENT (the header section is finished), false when it stopped
 *   for any other reason (e.g. more data is needed).
 */
WARCStream.prototype.parseHeaders = function () {
  while (this.parseHeader()) {
    // each successful call consumed one header line; keep going
  }
  return this.state === STATE.CONTENT;
};
/**
 * Consumes one line of the header section starting at `this.offset`.
 *
 * - No complete line buffered: returns false and changes nothing.
 * - Empty line: ends the header section (state becomes STATE.CONTENT) and
 *   returns false.
 * - Any other line: stored in `this.headers` when it matches `headerRegex`
 *   (lines that do not match are skipped) and true is returned.
 *
 * @returns {boolean} true when a header line was consumed and more may follow.
 */
WARCStream.prototype.parseHeader = function () {
  var found = firstMatch(this.matcher, this.data, this.offset);
  var lineEnd = found.index;
  var line;
  var parts;

  if (lineEnd === false || !(lineEnd < this.data.length)) {
    return false;
  }

  line = this.data.slice(this.offset, lineEnd);
  this.offset = lineEnd + found.matcher.length;

  if (line.length === 0) {
    // a blank line terminates the header section
    this.state = STATE.CONTENT;
    return false;
  }

  parts = headerRegex.exec(line.toString());
  if (parts) {
    this.headers[parts[1]] = parts[2];
  }
  return true;
};
/**
 * Moves buffered bytes into `this.content`: as many as are available, but
 * never more than are still missing to reach `this.contentLength`.
 * `this.offset` is advanced by the number of bytes taken.
 *
 * @returns {boolean} true once `this.content` holds exactly
 *   `this.contentLength` bytes.
 */
WARCStream.prototype.parseContent = function () {
  var buffered = this.data.length - this.offset;
  var missing = this.contentLength - this.content.length;
  var take = Math.min(buffered, missing);
  var end = this.offset + take;

  this.content = Buffer.concat([this.content, this.data.slice(this.offset, end)]);
  this.offset = end;

  return this.content.length === this.contentLength;
};
/**
 * Consumes the record separator that follows a record's content.
 *
 * 1. If the data at `this.offset` starts with one of `this.separator`, it is
 *    skipped and true is returned.
 * 2. If the buffered data ends in the middle of what may still become a
 *    separator (e.g. only "\r\n" of "\r\n\r\n" has arrived), false is
 *    returned and nothing is consumed, so the caller retries with the next
 *    chunk. Previously true was returned here and the leftover bytes were
 *    parsed as an empty protocol line.
 * 3. Otherwise (no separator at this position) the earlier lenient behaviour
 *    is kept: skip to just after the first separator firstMatch() finds, or
 *    consume nothing when there is none, and return true.
 *
 * @returns {boolean} false only when more data is needed to decide.
 */
WARCStream.prototype.parseSeparator = function () {
  var available = this.data.length - this.offset;
  var couldStillArrive = false;
  var s, candidate, compared, b, idx;

  for (s = 0; s < this.separator.length; s++) {
    candidate = this.separator[s];
    compared = Math.min(candidate.length, available);

    b = 0;
    while (b < compared && this.data[this.offset + b] === candidate[b]) {
      b++;
    }
    if (b < compared) {
      continue; // a buffered byte differs: not this separator
    }
    if (compared === candidate.length) {
      this.offset += candidate.length;
      return true;
    }
    // every buffered byte matched, but the separator is longer than what we have
    couldStillArrive = true;
  }

  if (couldStillArrive) {
    return false;
  }

  idx = firstMatch(this.separator, this.data, this.offset);
  if (idx.index !== false && idx.index < this.data.length) {
    this.offset = idx.index + idx.matcher.length;
  }
  return true;
};
/**
 * Finds the earliest position at or after `offset` where one of the byte
 * sequences in `matcher` occurs in `buf`. At each position the sequences are
 * tried in array order, so longer alternatives should be listed first.
 *
 * If the buffer ends in the middle of a sequence that has matched so far
 * (e.g. a trailing "\r" while "\r\n" is a candidate), "no match" is reported
 * so that the caller waits for more data instead of settling for a shorter
 * alternative. As a consequence, a final terminator that is a prefix of a
 * longer one is not reported until more data follows it.
 *
 * @param {Buffer[]} matcher Candidate byte sequences (empty ones are ignored).
 * @param {Buffer} buf Data to search.
 * @param {number} offset Position in `buf` at which to start.
 * @returns {{index: (number|boolean), matcher: ?Buffer}} `index` is where the
 *   match starts and `matcher` the sequence found there; when nothing is
 *   found `index` is `false` and `matcher` is `null`.
 */
function firstMatch(matcher, buf, offset) {
  var noMatch = { index: false, matcher: null };
  var pos, m, candidate, k;

  if (offset >= buf.length) {
    return noMatch;
  }

  for (pos = offset; pos < buf.length; pos++) {
    for (m = 0; m < matcher.length; m++) {
      candidate = matcher[m];
      if (candidate.length === 0) {
        continue;
      }

      // Count how many leading bytes of the candidate match at this position.
      k = 0;
      while (k < candidate.length && pos + k < buf.length && buf[pos + k] === candidate[k]) {
        k++;
      }

      if (k === candidate.length) {
        return { index: pos, matcher: candidate };
      }
      if (k > 0 && pos + k === buf.length) {
        // Ran out of data while still matching: undecidable for now.
        return noMatch;
      }
      // Otherwise this candidate does not occur here; try the next one
      // (previously a failed multi-byte compare was reported as a match).
    }
  }

  return noMatch;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment