Skip to content

Instantly share code, notes, and snippets.

@JavadocMD
Created August 1, 2020 19:16
Show Gist options
  • Save JavadocMD/0ed6fca7edb9251a2246f20fd53bfe7d to your computer and use it in GitHub Desktop.
Save JavadocMD/0ed6fca7edb9251a2246f20fd53bfe7d to your computer and use it in GitHub Desktop.
Removes repeated lines from the input.
import { Transform } from 'stream'
import { StringDecoder } from 'string_decoder'
/**
 * Creates a Transform that re-chunks a byte stream into lines.
 *
 * Incoming buffers are decoded as UTF-8 (the decoder holds back multi-byte
 * characters that straddle two chunks until they are complete), divided on
 * `r`, and every completed line is pushed as its own chunk with the separator
 * removed. Text following the last separator is kept until more input arrives
 * or the stream ends, at which point it is pushed as the final line.
 *
 * NOTE(review): blank lines are pushed as '' — a Readable that is not in
 * object mode appears to ignore zero-length chunks, so blank lines may never
 * reach the next stage. Confirm whether that is intended.
 *
 * @param r Line separator pattern; defaults to LF with an optional leading CR.
 * @returns A Transform that reports an Error for any chunk whose encoding is
 *   not 'buffer'.
 */
const split = (r: RegExp = /\r?\n/) => {
  const enc = 'utf8'
  const decoder = new StringDecoder(enc)
  // Text seen after the most recent separator: a line that is not finished yet.
  let pending = ''
  return new Transform({
    defaultEncoding: enc,
    transform(chunk, e, callback) {
      if ((e as any) === 'buffer') {
        const pieces = (pending + decoder.write(chunk)).split(r)
        // Only the final piece lacks a terminating separator; it becomes the
        // new carry-over. The fallback covers an empty result array.
        const tailIndex = pieces.length - 1
        pending = pieces[tailIndex] || ''
        for (let i = 0; i < tailIndex; i++) {
          this.push(pieces[i])
        }
        callback()
      } else {
        callback(new Error(`split expected a buffer, but got ${e}`))
      }
    },
    flush(callback) {
      // Emit whatever is left, including bytes still buffered in the decoder.
      this.push(pending + decoder.end())
      callback()
    },
  })
}
/**
 * Removes lines that are repeats of the previous line.
 *
 * Every incoming chunk is treated as one line and compared byte-for-byte with
 * the last chunk that was forwarded; it is passed downstream only when it
 * differs. Lines that repeat later, but not consecutively, are kept.
 *
 * @returns A Transform that reports an Error for any chunk whose encoding is
 *   not 'buffer'.
 */
const filterNonsequential = () => {
  // Last forwarded chunk. Starts empty, so a zero-length first chunk is
  // treated as a repeat.
  let prev: Buffer = Buffer.alloc(0)
  return new Transform({
    transform(chunk, e, callback) {
      if ((e as any) !== 'buffer') {
        callback(new Error(`filterNonsequential expected a buffer, but got ${e}`))
        return
      }
      // Buffer#equals checks both length and content, replacing the manual
      // index loop.
      if (!prev.equals(chunk)) {
        prev = chunk
        this.push(chunk)
      }
      callback()
    },
  })
}
/**
 * Creates a Transform that re-joins individually chunked lines with a
 * separator.
 *
 * The first chunk is forwarded untouched; every later chunk is forwarded with
 * `sep` prepended. No separator is therefore emitted before the first chunk or
 * after the last one.
 *
 * @param sep Separator placed between consecutive chunks, encoded as UTF-8;
 *   defaults to '\n'.
 * @returns A Transform that reports an Error for any chunk whose encoding is
 *   not 'buffer'.
 */
const join = (sep: string = '\n') => {
  const enc = 'utf8'
  // Encode the separator once rather than once per chunk.
  const sepBytes = Buffer.from(sep, enc)
  let isFirst = true
  return new Transform({
    defaultEncoding: enc,
    transform(chunk, e, callback) {
      if ((e as any) !== 'buffer') {
        callback(new Error(`join expected a buffer, but got ${e}`))
        return
      }
      if (isFirst) {
        isFirst = false
        this.push(chunk)
      } else {
        // Join at the byte level: decoding the chunk to a string and
        // re-encoding it would rewrite any bytes that are not valid UTF-8,
        // whereas the first chunk is passed through verbatim.
        this.push(Buffer.concat([sepBytes, chunk]))
      }
      callback()
    },
  })
}
/**
 * Creates a pass-through Transform that appends a trailer when the input ends.
 *
 * Chunks are forwarded unchanged; once the stream is flushed, `output` is
 * emitted as a final UTF-8 encoded chunk.
 *
 * @param output Text to emit after all input has been forwarded.
 */
const lastly = (output: string) =>
  new Transform({
    transform(chunk, _encoding, done) {
      // Forward the chunk as-is.
      this.push(chunk)
      done()
    },
    flush(done) {
      this.push(Buffer.from(output, 'utf8'))
      done()
    },
  })
// Usage example:
// $ unzip -c input-file.json.zip | node ./dedupe.js > output-file.json
//
// Pipeline: stdin -> one chunk per line -> drop consecutive repeats ->
// re-join with newlines -> append a trailing newline -> stdout.
const lines = process.stdin.pipe(split())
const deduped = lines.pipe(filterNonsequential())
const rejoined = deduped.pipe(join())
rejoined.pipe(lastly('\n')).pipe(process.stdout)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment