Skip to content

Instantly share code, notes, and snippets.

@bemasher
Last active December 17, 2015 17:19
Show Gist options
  • Save bemasher/5644998 to your computer and use it in GitHub Desktop.
Save bemasher/5644998 to your computer and use it in GitHub Desktop.
Parses SubRip format subtitles using a finite state machine. Handles malformed index and time ranges.
package main
import (
"bufio"
"fmt"
"os"
"regexp"
"strings"
"sync"
"time"
)
// SUB_FILENAME is the path of the SubRip file that main opens and parses.
const SUB_FILENAME = "example.srt"
// stateFn is one state of the parser's finite state machine. It is handed the
// lexer, acts on the lexer's current line, and returns the state to run on
// the next line. The driver loop in main stops once a state returns nil.
type stateFn func(*lexer) stateFn
// lexer holds the parser state shared by every state function: the input
// scanner, the subtitle being assembled, the previously emitted subtitle
// (used for ordering checks), and the channels used to hand results back.
type lexer struct {
// Embedded scanner supplying the input; main advances it once per state call.
*bufio.Scanner
// Number of Scan calls made so far; printed in error messages as the line number.
lineNumber int
// prev is the last subtitle sent on results; s is the one currently being built.
prev, s Subtitle
// Set by main when the most recent Scan call returned false.
eof bool
// Completed subtitles are sent here; Exit closes it.
results chan Subtitle
// Exit sends a single value here after closing results.
done chan struct{}
}
// Subtitle is a single parsed cue: its index, the start and stop offsets of
// its display window, and its text lines in file order.
type Subtitle struct {
	Idx         int
	Start, Stop time.Duration
	Lines       []string
}

// ApplyLines replaces every entry of s.Lines with the result of calling f on
// it. The receiver is a value, but the copy shares its backing array with the
// caller's slice, so the replacements are visible to the caller.
func (s Subtitle) ApplyLines(f func(string) string) {
	for n := range s.Lines {
		s.Lines[n] = f(s.Lines[n])
	}
}
// func (s Subtitle) String() string {
// r := fmt.Sprintf("%d\r\n", s.Idx)
// r += fmt.Sprintf("%s --> %s\r\n", FormatDuration(s.Start), FormatDuration(s.Stop))
// for _, line := range s.Lines {
// r += line + "\r\n"
// }
// r += "\r\n"
// return r
// }
// FormatDuration renders d as a SubRip timestamp of the form "HH:MM:SS,mmm".
// Each field is zero-padded to its minimum width; the hour field is not
// wrapped, so it grows beyond two digits for durations of 100 hours or more.
func FormatDuration(d time.Duration) (r string) {
	hours := d / time.Hour
	minutes := (d % time.Hour) / time.Minute
	seconds := (d % time.Minute) / time.Second
	millis := (d % time.Second) / time.Millisecond
	return fmt.Sprintf("%02d:%02d:%02d,%03d", hours, minutes, seconds, millis)
}
// ParseTime parses a SubRip timestamp of the form "HH:MM:SS,mmm" and returns
// it as an offset from the start of the media.
//
// On any failure the returned duration is 0 and the error names the offending
// text. Input is rejected when it does not scan as four integers separated by
// ':' ':' and ',', or when any of the four components is negative.
func ParseTime(r string) (time.Duration, error) {
	var hour, minute, second, milli int
	if _, err := fmt.Sscanf(r, "%d:%d:%d,%d", &hour, &minute, &second, &milli); err != nil {
		// Do not hand back a half-built duration next to an error.
		return 0, fmt.Errorf("parsing timestamp %q: %w", r, err)
	}

	// %d accepts a leading sign, which is never valid inside a timestamp.
	if hour < 0 || minute < 0 || second < 0 || milli < 0 {
		return 0, fmt.Errorf("parsing timestamp %q: negative component", r)
	}

	d := time.Duration(hour)*time.Hour +
		time.Duration(minute)*time.Minute +
		time.Duration(second)*time.Second +
		time.Duration(milli)*time.Millisecond
	return d, nil
}
// Idx is the state that parses a subtitle's index line into l.s.Idx.
//
// It returns TimeRange when the index parses and is greater than the previous
// subtitle's index. It returns itself when the current line is blank, so that
// extra blank lines between subtitles are skipped. It returns Exit when the
// input is exhausted, when the scanner reported an error, or when the index
// is unparsable or not increasing; the latter cases print a diagnostic first.
func Idx(l *lexer) stateFn {
	if err := l.Err(); err != nil {
		fmt.Printf("Error reading index:%d: %s\n", l.lineNumber, err)
		return Exit
	}

	text := strings.TrimSpace(l.Text())
	if text == "" {
		// Running out of input while waiting for the next index is the
		// normal end of a file, not a parse error.
		if l.eof {
			return Exit
		}
		// Tolerate additional blank lines between subtitles.
		return Idx
	}

	if _, err := fmt.Sscanf(text, "%d", &l.s.Idx); err != nil {
		fmt.Printf("Error parsing index:%d: %s\n", l.lineNumber, err)
		return Exit
	}

	// Each index must be strictly greater than the previous one.
	if l.s.Idx <= l.prev.Idx {
		fmt.Printf("Invalid index, expected:%d: %d < %d\n", l.lineNumber, l.prev.Idx, l.s.Idx)
		return Exit
	}

	// The time range line follows the index.
	return TimeRange
}
// TimeRange is the state that parses a "start --> stop" line into l.s.Start
// and l.s.Stop.
//
// It returns Line on success. It prints a diagnostic and returns Exit when the
// scanner reported an error, the line does not split into exactly two fields
// on " --> ", either timestamp fails to parse, the range is empty or
// reversed, or the range starts before the previous subtitle has stopped.
func TimeRange(l *lexer) stateFn {
	if err := l.Err(); err != nil {
		fmt.Printf("Error reading time range:%d: %s\n", l.lineNumber, err)
		return Exit
	}

	line := strings.TrimSpace(l.Text())

	// Exactly one divider is expected, yielding the start and stop fields.
	times := strings.Split(line, " --> ")
	if len(times) != 2 {
		fmt.Printf("Error splitting time range:%d: %q\n", l.lineNumber, line)
		return Exit
	}

	var err error
	if l.s.Start, err = ParseTime(times[0]); err != nil {
		fmt.Printf("Error parsing start time:%d:%s\n", l.lineNumber, err)
		return Exit
	}
	if l.s.Stop, err = ParseTime(times[1]); err != nil {
		fmt.Printf("Error parsing stop time:%d:%s\n", l.lineNumber, err)
		return Exit
	}

	// A subtitle must be displayed for a positive amount of time.
	if l.s.Start >= l.s.Stop {
		fmt.Printf("Invalid time range, expected:%d: %s < %s\n", l.lineNumber, l.s.Start, l.s.Stop)
		return Exit
	}

	// Starting exactly when the previous subtitle stops is not an overlap.
	// This also admits a first subtitle that starts at 00:00:00,000, since
	// the zero-valued prev has Stop == 0.
	if l.s.Start < l.prev.Stop {
		fmt.Printf("Time range overlaps previous subtitle, expected:%d: %s <= %s\n", l.lineNumber, l.prev.Stop, l.s.Start)
		return Exit
	}

	return Line
}
// Line is the state that collects the text lines of the current subtitle.
//
// A non-blank line read before end of input is appended to l.s.Lines and Line
// is returned again. Otherwise the subtitle is complete: it is sent on
// l.results, remembered as l.prev, and l.s is reset. The next state is then
// Idx, or Exit if the input is exhausted. A scanner error prints a diagnostic
// and returns Exit.
func Line(l *lexer) (s stateFn) {
	if err := l.Err(); err != nil {
		fmt.Printf("Error reading line:%d: %s\n", l.lineNumber, err)
		return Exit
	}

	line := strings.TrimSpace(l.Text())

	// Text line: keep accumulating.
	if len(line) > 0 && !l.eof {
		l.s.Lines = append(l.s.Lines, line)
		return Line
	}

	// A blank line or end of input terminates the subtitle. Emit it, keep it
	// as prev for the ordering checks, and start from a fresh Subtitle so the
	// next one does not share the Lines slice.
	l.results <- l.s
	l.prev, l.s = l.s, Subtitle{}

	if l.eof {
		return Exit
	}

	// The next subtitle begins with its index.
	return Idx
}
// Exit is the terminal state. It closes l.results so that a range over the
// channel ends, then sends one value on l.done, and finally returns nil,
// which is what stops the driver loop.
func Exit(l *lexer) stateFn {
	var finished struct{}

	// Order matters: the results channel is closed before done is signalled.
	close(l.results)
	l.done <- finished

	return nil
}
// main opens SUB_FILENAME, runs the parser state machine over it in one
// goroutine, and prints each parsed subtitle (with style tags removed) from a
// second goroutine. It returns only after the state machine has exited and
// the printing goroutine has drained the results channel.
func main() {
	// Open subrip file
	subFile, err := os.Open(SUB_FILENAME)
	if err != nil {
		fmt.Println("Error opening subtitle:", err)
		os.Exit(1)
	}
	defer subFile.Close()

	// Named lex rather than lexer so the variable does not shadow the type.
	var lex lexer
	lex.Scanner = bufio.NewScanner(subFile)
	lex.results = make(chan Subtitle, 1)
	lex.done = make(chan struct{})

	// Start the result consumer
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()

		// Regex to remove style tags
		// NOTE(review): compiled in POSIX mode and excludes '.' inside tags;
		// confirm that both are intended.
		re := regexp.MustCompilePOSIX("<([^<.]+?)>")

		for sub := range lex.results {
			// Remove all style tags
			sub.ApplyLines(func(i string) string {
				return re.ReplaceAllString(i, "")
			})
			fmt.Printf("%+v\n", sub)
		}
	}()

	// Start the subtitle parser
	go func() {
		// Parsing begins with an index, continue until
		// the state machine exits. One line is scanned per state call.
		for state := Idx; state != nil; {
			lex.eof = !lex.Scan()
			lex.lineNumber++
			state = state(&lex)
		}
	}()

	// Wait for the state machine to exit...
	<-lex.done
	// ...and for the consumer to finish printing. Without this wait, main
	// could return while subtitles were still queued on the results channel.
	wg.Wait()
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment