Last active
December 17, 2015 17:19
-
-
Save bemasher/5644998 to your computer and use it in GitHub Desktop.
Parses SubRip format subtitles using a finite state machine. Handles malformed index and time ranges.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"bufio" | |
"fmt" | |
"os" | |
"regexp" | |
"strings" | |
"sync" | |
"time" | |
) | |
const (
	// SUB_FILENAME is the path of the SubRip file that main opens and parses.
	// The underscore name is kept as-is because main refers to it by this name.
	SUB_FILENAME = "example.srt"
)
// A state function takes a lexer and returns a state function | |
type stateFn func(*lexer) stateFn | |
// Lexer contains a bufio reader to parse input from | |
// stores current and previously parsed subtitle | |
// and a results channel for returning parsed subtitles | |
type lexer struct { | |
*bufio.Scanner | |
lineNumber int | |
prev, s Subtitle | |
eof bool | |
results chan Subtitle | |
done chan struct{} | |
} | |
// Subtitle is a single SubRip cue: its index, the time span during which it
// is shown, and its text lines.
type Subtitle struct {
	Idx         int
	Start, Stop time.Duration
	Lines       []string
}

// ApplyLines replaces every entry of s.Lines with the result of calling f on
// it.
//
// The receiver is a value, but the copied slice header still points at the
// caller's backing array, so the caller's lines are modified in place. The
// number of lines never changes.
func (s Subtitle) ApplyLines(f func(string) string) {
	for i := range s.Lines {
		s.Lines[i] = f(s.Lines[i])
	}
}
// func (s Subtitle) String() string { | |
// r := fmt.Sprintf("%d\r\n", s.Idx) | |
// r += fmt.Sprintf("%s --> %s\r\n", FormatDuration(s.Start), FormatDuration(s.Stop)) | |
// for _, line := range s.Lines { | |
// r += line + "\r\n" | |
// } | |
// r += "\r\n" | |
// return r | |
// } | |
// FormatDuration renders d as a SubRip timestamp of the form "HH:MM:SS,mmm".
//
// Hours are zero-padded to two digits but are not capped, so durations of
// 100 hours or more simply use more digits. A negative duration is written
// as a single leading "-" followed by the timestamp of its absolute value,
// rather than putting a sign on each individual field.
func FormatDuration(d time.Duration) string {
	sign := ""
	if d < 0 {
		sign = "-"
		d = -d
	}

	hours := int64(d / time.Hour)
	minutes := int64((d % time.Hour) / time.Minute)
	seconds := int64((d % time.Minute) / time.Second)
	millis := int64((d % time.Second) / time.Millisecond)

	return fmt.Sprintf("%s%02d:%02d:%02d,%03d", sign, hours, minutes, seconds, millis)
}
// ParseTime parses a SubRip timestamp of the form "HH:MM:SS,mmm" and returns
// it as a time.Duration.
//
// Parsing is done with fmt.Sscanf, so anything following the millisecond
// field is ignored. The individual fields are not range-checked. On a parse
// failure the returned duration is zero and the scan error is returned
// unchanged.
func ParseTime(r string) (time.Duration, error) {
	var hour, minute, second, milli int
	if _, err := fmt.Sscanf(r, "%d:%d:%d,%d", &hour, &minute, &second, &milli); err != nil {
		// Do not hand back a duration assembled from the fields that
		// happened to scan before the failure.
		return 0, err
	}

	d := time.Duration(hour)*time.Hour +
		time.Duration(minute)*time.Minute +
		time.Duration(second)*time.Second +
		time.Duration(milli)*time.Millisecond
	return d, nil
}
// Parse a subtitle index | |
func Idx(l *lexer) stateFn { | |
var err error | |
if err = l.Err(); err != nil { | |
fmt.Println("Error reading index:%d: %s\n", l.lineNumber, err) | |
return Exit | |
} | |
_, err = fmt.Sscanf(strings.TrimSpace(l.Text()), "%d", &l.s.Idx) | |
// Unknown error parsing index | |
if err != nil { | |
fmt.Printf("Error parsing index:%d: %s\n", l.lineNumber, err) | |
return Exit | |
} | |
// Indexes must be monotonically increasing | |
if l.s.Idx <= l.prev.Idx { | |
fmt.Printf("Invalid index, expected:%d: %d < %d\n", l.lineNumber, l.prev.Idx, l.s.Idx) | |
return Exit | |
} | |
// Parse the start time next | |
return TimeRange | |
} | |
// Parse the subtitle's time range | |
func TimeRange(l *lexer) stateFn { | |
var err error | |
if err = l.Err(); err != nil { | |
fmt.Println("Error reading time range:%d: %s\n", l.lineNumber, err) | |
return Exit | |
} | |
line := strings.TrimSpace(l.Text()) | |
// Split on the time divider | |
times := strings.Split(line, " --> ") | |
// Should have exactly two fields after splitting | |
if len(times) != 2 { | |
fmt.Printf("Error splitting time range:%d: %q\n", l.lineNumber, line) | |
return Exit | |
} | |
// Parse the start time, exit on any error | |
l.s.Start, err = ParseTime(times[0]) | |
if err != nil { | |
fmt.Printf("Error parsing start time:%d:%s\n", l.lineNumber, err) | |
return Exit | |
} | |
// Parse the end time, exit on any error | |
l.s.Stop, err = ParseTime(times[1]) | |
if err != nil { | |
fmt.Printf("Error parsing stop time:%d:%s\n", l.lineNumber, err) | |
return Exit | |
} | |
// Handle any invalid time ranges | |
if l.s.Start >= l.s.Stop { | |
fmt.Printf("Invalid time range, expected:%d: %s < %s\n", l.lineNumber, l.s.Start, l.s.Stop) | |
return Exit | |
} | |
if l.s.Start <= l.prev.Stop { | |
fmt.Printf("Time range overlaps previous subtitle, expected:%d: %s < %s\n", l.lineNumber, l.prev.Stop, l.s.Start) | |
return Exit | |
} | |
return Line | |
} | |
// Parse subtitle lines | |
func Line(l *lexer) (s stateFn) { | |
var err error | |
if err = l.Err(); err != nil { | |
fmt.Println("Error reading line:%d: %s\n", l.lineNumber, err) | |
return Exit | |
} | |
line := strings.TrimSpace(l.Text()) | |
// If the line isn't empty after trimming, append | |
// it to the subtitle and parse another line | |
if len(line) > 0 && !l.eof { | |
l.s.Lines = append(l.s.Lines, line) | |
return Line | |
} | |
// We haven't exited yet so line must be empty, send the | |
// current subtitle on the result channel. Make a new | |
// subtitle and store the current as the previous | |
l.results <- l.s | |
l.prev, l.s = l.s, Subtitle{} | |
if l.eof { | |
return Exit | |
} | |
// Parse a new subtitle starting with it's index | |
return Idx | |
} | |
// Close the result channel, signal we're done and exit the state machine by returning nil | |
func Exit(l *lexer) stateFn { | |
close(l.results) | |
l.done <- struct{}{} | |
return nil | |
} | |
func main() { | |
// Open subrip file | |
subFile, err := os.Open(SUB_FILENAME) | |
if err != nil { | |
fmt.Println("Error opening subtitle:", err) | |
os.Exit(1) | |
} | |
defer subFile.Close() | |
var lexer lexer | |
lexer.Scanner = bufio.NewScanner(subFile) | |
lexer.results = make(chan Subtitle, 1) | |
lexer.done = make(chan struct{}) | |
// Start the result consumer | |
wg := new(sync.WaitGroup) | |
wg.Add(1) | |
go func() { | |
// Regex to remove style tags | |
re := regexp.MustCompilePOSIX("<([^<.]+?)>") | |
for sub := range lexer.results { | |
// Remove all style tags | |
sub.ApplyLines(func(i string) string { | |
return re.ReplaceAllString(i, "") | |
}) | |
fmt.Printf("%+v\n", sub) | |
} | |
}() | |
// Start the subtitle parser | |
go func() { | |
// Parsing begins with an index, continue until | |
// the state machine exits. | |
for state := Idx; state != nil; { | |
lexer.eof = !lexer.Scan() | |
lexer.lineNumber++ | |
state = state(&lexer) | |
} | |
}() | |
// Wait for state machine to exit and parsed subtitles to be printed | |
<-lexer.done | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment