Last active
November 23, 2021 07:18
-
-
Save nimatrueway/0b25126406b49438f7cbede69691522b to your computer and use it in GitHub Desktop.
A small tool to fix overlapping subtitles (especially those extracted from YouTube's English auto-generated subtitles, i.e. VTT files that you convert to SRT with ffmpeg)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "nima-scripts", | |
"version": "1.0.0", | |
"@comment dependencies": [ | |
"// argparse: powerful argument parser", | |
"// https://github.com/nodeca/argparse", | |
"// subtitle: subtitle parser", | |
"// https://github.com/gsantiago/subtitle.js" | |
], | |
"dependencies": { | |
"@types/argparse": "^2.0.10", | |
"argparse": "^2.0.1", | |
"subtitle": "4.1.1" | |
}, | |
"devDependencies": { | |
"@types/node": "^16.11.7", | |
"ts-node": "^10.4.0", | |
"typescript": "^4.4.4" | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ts-node --transpile-only | |
import { ArgumentParser } from 'argparse'; | |
import { parseSync, stringifySync, NodeList, Node, Cue, NodeCue } from 'subtitle' | |
import { copyFileSync, readFileSync, writeFileSync } from 'fs' | |
/** Shape of the parsed command-line arguments: the single positional `srt-file` value. */
interface ParsedArgs {
  "srt-file": string;
}
/**
 * Builds the command-line parser for this tool.
 *
 * The parser is created with `add_help` enabled and declares exactly one
 * positional argument, `srt-file`.
 *
 * @returns the configured `ArgumentParser`
 */
function create_argument_parser(): ArgumentParser {
  const description =
    'A tool to fix overlapping subtitles (especially the ones extracted from english auto-subtitles of youtube, vtt files that you would convert to srt with ffmpeg)';
  const cli = new ArgumentParser({ description, add_help: true });

  cli.add_argument('srt-file', {
    help: 'srt file to process and fix (a backup will be created as srt-file.bak)',
  });

  return cli;
}
/**
 * Callback that inspects, and may rewrite, one subtitle cue at a time.
 * A cue is a single subtitle entry (its timing plus its text).
 *
 * The callback is free to mutate both `prev` and `current` in place; such
 * changes are meant to show up in the resulting subtitle file.
 *
 * @param prev the cue that comes before `current`, for context-aware
 *             modifiers; `null` when there is no such cue
 * @param current the cue being processed
 * @returns `true` to keep `current` in the resulting subtitle,
 *          `false` to drop it
 */
type IModifier = (prev: Cue | null, current: Cue) => boolean;
/**
 * Runs `modifier` over every cue node of `nodes`, in order, and collects the
 * cues the modifier accepts.
 *
 * The `prev` argument handed to the modifier is the last cue that was
 * actually kept, so any change the modifier makes to `prev` lands on a cue
 * that is part of the result. A non-cue node is skipped and breaks the chain:
 * the next cue after it is called with `prev === null`.
 *
 * @param nodes parsed subtitle nodes to walk through
 * @param modifier callback deciding whether each cue is kept; it may mutate
 *                 the cues it receives
 * @returns the kept cues, wrapped as fresh `cue` nodes that share the
 *          original (possibly mutated) cue data
 */
function traverse(nodes: NodeList, modifier: IModifier): NodeCue[] {
  // non-cue nodes are those that could not be processed.
  const isCue = (node: Node): node is NodeCue => node.type === 'cue';

  const kept: NodeCue[] = [];
  // Last cue that made it into `kept`; null at the start and after a non-cue node.
  let prev: NodeCue | null = null;

  for (const current of nodes) {
    if (!isCue(current)) {
      prev = null;
      continue;
    }

    if (modifier(prev?.data ?? null, current.data)) {
      const keptNode: NodeCue = { type: 'cue', data: current.data };
      // push() instead of re-spreading the array keeps the walk linear.
      kept.push(keptNode);
      // Only a kept cue becomes `prev`: a dropped cue is not in the output,
      // so mutating it later would have no effect on the result.
      prev = keptNode;
    }
  }

  return kept;
}
/**
 * Modifier that cleans up overlapping / repeating subtitle cues.
 *
 * Steps, in order:
 *  1. trim `current.text`; drop the cue if nothing is left;
 *  2. if there is no previous cue, keep `current` as is;
 *  3. drop a super-short cue whose text already appears in `prev`, extending
 *     `prev` so it lasts until the end of the dropped cue;
 *  4. if the first line of `current` repeats the last line of `prev`, remove
 *     that first line, unless it is the only line (that would leave an
 *     empty cue in the output);
 *  5. if `current` starts before `prev` ends, shorten `prev` so it ends just
 *     before `current` starts (never earlier than `prev` itself starts).
 *
 * @param prev previous cue, or `null` when there is none; may be mutated
 * @param current cue being processed; may be mutated
 * @returns `true` to keep `current`, `false` to drop it
 */
const fix: IModifier = function (prev: Cue | null, current: Cue): boolean {
  // Cues lasting less than this count as "super-short". Same unit as
  // `start`/`end` (presumably milliseconds; confirm against the parser).
  const SHORT_CUE_DURATION = 150;

  // remove all beginning/trailing whitespace characters
  current.text = current.text.trim();

  // skip empty subtitles
  if (current.text.length === 0) {
    return false;
  }

  // no further processing needed for first node
  if (prev == null) {
    return true;
  }

  // skip over super-short subtitles that only repeat what the previous
  // subtitle already contains, and just prolong the previous subtitle
  if (current.end - current.start < SHORT_CUE_DURATION && prev.text.includes(current.text)) {
    prev.end = current.end;
    return false;
  }

  // if the first line of the current subtitle repeats the last line of the
  // previous one, remove it, but never remove the only line of a cue
  const currentLines = current.text.split('\n');
  const prevLines = prev.text.split('\n');
  if (currentLines.length > 1 && currentLines[0] === prevLines[prevLines.length - 1]) {
    current.text = currentLines.slice(1).join('\n');
  }

  // if the current subtitle starts before the previous one ends, cut the
  // previous one short; clamp so its end never precedes its own start
  if (current.start < prev.end) {
    prev.end = Math.max(prev.start, current.start - 1);
  }

  return true;
};
/**
 * Script entry point: reads the subtitle file named on the command line,
 * runs the `fix` modifier over its cues, copies the untouched file to
 * `<srt-file>.bak`, then overwrites the original with the result in SRT format.
 */
function main(): void {
  const { 'srt-file': srtPath } = create_argument_parser().parse_args() as ParsedArgs;

  const originalText = readFileSync(srtPath, { encoding: "utf-8" });
  const fixedCues = traverse(parseSync(originalText), fix);

  // The backup is taken only after parsing and fixing succeeded, right before the overwrite.
  copyFileSync(srtPath, `${srtPath}.bak`);
  writeFileSync(srtPath, stringifySync(fixedCues, { format: 'SRT' }));
}

main();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment