Last active
March 16, 2025 14:47
-
-
Save nimatrueway/4589700f49c691e5413c5b2df4d02f4f to your computer and use it in GitHub Desktop.
A little tool to fix overlapping subtitles (especially those extracted from YouTube's English auto-generated subtitles, i.e. VTT files that you convert to SRT with ffmpeg).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"time" | |
"regexp" | |
"bufio" | |
"strconv" | |
"fmt" | |
"os" | |
"errors" | |
"io" | |
"strings" | |
) | |
// Subtitle is a single SRT cue: its sequence number, the interval during
// which it is shown, and its (possibly multi-line) text.
//
// The field order matters: the struct is built with a positional literal
// elsewhere in this file.
type Subtitle struct {
	idx              int           // sequence number as read from the input
	fromTime, toTime time.Duration // display interval, as offsets from the start
	text             string        // cue text; lines are joined with "\n"
}
// timeFramePattern matches an SRT timing line such as
// "00:03:14,000 --> 00:03:14,159". Submatches 1-4 are the hour, minute,
// second and millisecond of the start time; submatches 5-8 are the same
// fields of the end time.
//
// MustCompile is used so that a malformed pattern fails loudly at start-up
// instead of leaving a nil *regexp.Regexp behind.
var timeFramePattern = regexp.MustCompile(`(\d+):(\d+):(\d+),(\d+) --> (\d+):(\d+):(\d+),(\d+)`)
// getDuration converts the four textual fields of a timestamp
// (hour, minute, second, millisecond, in that order) into a time.Duration.
//
// parts must hold at least four elements. A field that does not parse as an
// integer contributes zero to the result.
func getDuration(parts []string) time.Duration {
	units := [...]time.Duration{time.Hour, time.Minute, time.Second, time.Millisecond}

	var total time.Duration
	for i, unit := range units {
		// The conversion error is deliberately dropped: Atoi yields 0 on
		// failure, which makes an unparsable field count as zero.
		n, _ := strconv.Atoi(parts[i])
		total += unit * time.Duration(n)
	}
	return total
}
// printDuration renders a duration as an SRT timestamp, "HH:MM:SS,mmm".
// Anything finer than a millisecond is truncated; the hour field grows
// beyond two digits when needed.
func printDuration(duration time.Duration) string {
	totalMillis := int64(duration / time.Millisecond)

	millisecond := totalMillis % 1000
	totalSeconds := totalMillis / 1000

	second := totalSeconds % 60
	totalMinutes := totalSeconds / 60

	minute := totalMinutes % 60
	hour := totalMinutes / 60

	return fmt.Sprintf(`%02d:%02d:%02d,%03d`, hour, minute, second, millisecond)
}
func readOneSubtitle(scanner *bufio.Scanner) (*Subtitle, error) { | |
// read idx | |
if !scanner.Scan() { | |
return nil, nil | |
} | |
idxRaw := scanner.Text() | |
idx, err := strconv.Atoi(idxRaw) | |
if err != nil { | |
return nil, errors.New("invalid subtitle index") | |
} | |
// read timing | |
if !scanner.Scan() { | |
return nil, errors.New("could not find subtitle timing") | |
} | |
timing := timeFramePattern.FindStringSubmatch(scanner.Text()) | |
if timing == nil { | |
return nil, errors.New("invalid subtitle timing") | |
} | |
fromTime := getDuration(timing[1:5]) | |
toTime := getDuration(timing[5:9]) | |
// read content | |
if !scanner.Scan() { | |
return nil, errors.New("could not find subtitle text") | |
} | |
content := scanner.Text() | |
for scanner.Scan() && scanner.Text() != "" { | |
content += "\n" | |
content += scanner.Text() | |
} | |
subtitle := &Subtitle{idx, fromTime, toTime, content} | |
return subtitle, nil | |
} | |
func writeOneSubtitle(file io.Writer, subtitle *Subtitle, idx *int) error { | |
_, err := fmt.Fprint(file, | |
*idx, "\n", | |
printDuration(subtitle.fromTime), " --> ", printDuration(subtitle.toTime), "\n", | |
subtitle.text, "\n\n") | |
*idx++ | |
return err | |
} | |
func main() { | |
if len(os.Args) < 2 { | |
println("Provide a subtitle file to fix.\ne.g. subtitle-fixer mysubtitle.srt") | |
return | |
} | |
filePath := os.Args[1] | |
newFilePath := filePath + ".fixed" | |
file, _ := os.Open(filePath) | |
newFile, _ := os.Create(newFilePath) | |
defer file.Close() | |
defer newFile.Close() | |
scanner := bufio.NewScanner(file) | |
var newIdx = 1 | |
var lastSubtitle *Subtitle = nil | |
for { | |
subtitle, err := readOneSubtitle(scanner) | |
if lastSubtitle != nil { | |
if subtitle != nil { | |
subtitle.text = strings.Trim(subtitle.text, "\n ") | |
if len(subtitle.text) == 0 { // skip over empty subtitles | |
continue | |
} | |
// skip over super-short subtitles that basically contain what their previous subtitle contains, and just prolong previous subtitle | |
if subtitle.toTime - subtitle.fromTime < time.Millisecond * 150 && | |
strings.Contains(lastSubtitle.text, subtitle.text) { | |
lastSubtitle.toTime = subtitle.toTime | |
continue | |
} | |
// if first-line of current subtitle is repeating last-line of previous-subtitle remove it | |
currentLines := strings.Split(subtitle.text, "\n") | |
lastLines := strings.Split(lastSubtitle.text, "\n") | |
if currentLines[0] == lastLines[len(lastLines)-1] { | |
subtitle.text = strings.Join(currentLines[1:], "\n") | |
} | |
// if first-line of current subtitle is repeating last-line of previous-subtitle remove it | |
if subtitle.fromTime < lastSubtitle.toTime { | |
lastSubtitle.toTime = subtitle.fromTime - time.Millisecond | |
} | |
} | |
writeOneSubtitle(newFile, lastSubtitle, &newIdx) | |
} | |
if subtitle == nil { | |
break | |
} | |
if err != nil { | |
panic(err) | |
} | |
lastSubtitle = subtitle | |
} | |
os.Rename(filePath, filePath + ".bak") | |
os.Rename(newFilePath, filePath) | |
} |
For anyone else who would find this useful:
I had an issue with my SRT file being rejected by a picky program for having occasional blank entries like
42 00:03:14,000 --> 00:03:14,159
I moved the section at https://gist.github.com/nimatrueway/4589700f49c691e5413c5b2df4d02f4f#file-subtitle-overlap-fixer-go-L111-L113 down to the end of the block to perform this check last. My thinking is that I was running into issues at line https://gist.github.com/nimatrueway/4589700f49c691e5413c5b2df4d02f4f#file-subtitle-overlap-fixer-go-L124 that subverted the previous empty line check. In any case, pushing this section down seemed to resolve my issues!
Cheers to @nimatrueway for this awesome script. It saved me a ton of time!
Thanks, forked it.
Thank you for sharing this, @nimatrueway.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
AWESOME! Did exactly what I needed. There were some additional fixes I needed to do (sentence capitalization and changing lowercase "i" to uppercase "I" where needed). I wrote a little OS X bash script for these issues if anyone wants to try it: https://github.com/bruno-sardine/mac#Further-correct-YouTube-captions-captfixsh