Created
August 1, 2020 19:16
-
-
Save JavadocMD/0ed6fca7edb9251a2246f20fd53bfe7d to your computer and use it in GitHub Desktop.
Removes consecutive repeated lines from the input.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { Transform } from 'stream' | |
import { StringDecoder } from 'string_decoder' | |
/**
 * Builds a Transform that re-chunks its input on a separator pattern.
 *
 * Incoming buffers are decoded as UTF-8 through a StringDecoder (so a
 * multi-byte character cut by a chunk boundary is reassembled), appended to
 * whatever text was left over from the previous chunk, and split on `r`.
 * Every piece except the last is pushed downstream; the last piece is kept
 * as the start of the next line. On flush the leftover text, plus anything
 * the decoder still holds, is pushed as the final piece.
 *
 * NOTE(review): pieces are pushed as strings on a non-object-mode stream;
 * zero-length pieces (blank lines) appear to be discarded by the readable
 * side — confirm whether blank lines need to survive.
 *
 * @param r Separator pattern; defaults to LF with an optional preceding CR.
 * @returns A Transform whose callback receives an Error when a chunk's
 *   encoding is not reported as 'buffer'.
 */
const split = (r: RegExp = /\r?\n/) => {
  const encoding = 'utf8'
  const decoder = new StringDecoder(encoding)
  // Text seen after the most recent separator; possibly an unfinished line.
  let remainder = ''

  return new Transform({
    defaultEncoding: encoding,
    transform(chunk, e, callback) {
      if (String(e) !== 'buffer') {
        callback(new Error(`split expected a buffer, but got ${e}`))
        return
      }
      const pieces = (remainder + decoder.write(chunk)).split(r)
      const lastIndex = pieces.length - 1
      // Everything before the final piece is a complete line.
      pieces.slice(0, lastIndex).forEach((line) => {
        this.push(line)
      })
      // The final piece has no separator after it yet, so hold it back.
      remainder = pieces[lastIndex] || ''
      callback()
    },
    flush(callback) {
      this.push(remainder + decoder.end())
      callback()
    },
  })
}
/**
 * Removes lines that are repeats of the previous line.
 *
 * Each incoming chunk is treated as one line. A chunk is forwarded only when
 * its bytes differ from the most recently forwarded chunk, so runs of
 * identical consecutive chunks collapse to a single occurrence.
 * Non-adjacent duplicates are kept.
 *
 * @returns A Transform whose callback receives an Error when a chunk is not
 *   a Buffer.
 */
const filterNonsequential = () => {
  // Most recently forwarded chunk. Starts empty, so a zero-length first
  // chunk is treated as a repeat and not forwarded.
  let prev: Buffer = Buffer.alloc(0)
  return new Transform({
    transform(chunk, e, callback) {
      if (!Buffer.isBuffer(chunk)) {
        callback(new Error(`filterNonsequential expected a buffer, but got ${e}`))
        return
      }
      // Buffer.equals compares length and content byte-for-byte.
      if (!prev.equals(chunk)) {
        prev = chunk
        this.push(chunk)
      }
      callback()
    },
  })
}
/**
 * Builds a Transform that re-joins chunks with a separator.
 *
 * The first chunk is forwarded unchanged; every later chunk is forwarded
 * with `sep` (encoded as UTF-8) in front of it, so the separator appears
 * between chunks but not before the first or after the last.
 *
 * Chunk bytes are forwarded as-is rather than being decoded and re-encoded,
 * so every chunk is treated the same way and byte sequences that are not
 * valid UTF-8 pass through unmodified.
 *
 * @param sep Separator inserted between consecutive chunks; defaults to '\n'.
 * @returns A Transform whose callback receives an Error when a chunk is not
 *   a Buffer.
 */
const join = (sep: string = '\n') => {
  const enc = 'utf8'
  // Encode the separator once instead of on every chunk.
  const sepBytes = Buffer.from(sep, enc)
  let isFirst = true
  return new Transform({
    defaultEncoding: enc,
    transform(chunk, e, callback) {
      if (!Buffer.isBuffer(chunk)) {
        callback(new Error(`join expected a buffer, but got ${e}`))
        return
      }
      if (isFirst) {
        isFirst = false
        this.push(chunk)
      } else {
        // One push per input chunk: separator bytes followed by raw bytes.
        this.push(Buffer.concat([sepBytes, chunk]))
      }
      callback()
    },
  })
}
/**
 * Builds a pass-through Transform that appends a trailer.
 *
 * Every chunk is forwarded unchanged. When the input ends, `output`
 * (encoded as UTF-8) is emitted as one final chunk.
 *
 * @param output Text to emit after all input has been forwarded.
 */
const lastly = (output: string) =>
  new Transform({
    transform(chunk, _encoding, done) {
      this.push(chunk)
      done()
    },
    flush(done) {
      const trailer = Buffer.from(output, 'utf8')
      this.push(trailer)
      done()
    },
  })
// Usage example:
// $ unzip -c input-file.json.zip | node ./dedupe.js > output-file.json

// Pipeline: stdin -> lines -> drop consecutive repeats -> re-join with '\n'
// -> append a trailing '\n' -> stdout.
const lines = process.stdin.pipe(split())
const deduped = lines.pipe(filterNonsequential())
const rejoined = deduped.pipe(join())
rejoined.pipe(lastly('\n')).pipe(process.stdout)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment