Last active
November 15, 2022 20:50
-
-
Save jasdev/f71e0079e8b2ec26baa447f37b0ed0d8 to your computer and use it in GitHub Desktop.
Sketch of an SRT file parser.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Parsing | |
import Foundation | |
// [SubRip file format spec](https://en.wikipedia.org/wiki/SubRip#File_format).
/// Sample SubRip (SRT) document used to exercise the parsers in this file.
///
/// Each subtitle group consists of a sequence-number line, a
/// `start --> end` timecode line, and the subtitle text. Groups are separated
/// by one blank line: the group parser looks for `"\n\n"` to find the end of a
/// subtitle's text, and the top-level parser uses two consecutive newlines as
/// the separator between groups. Without the blank lines the entire remainder
/// of the document would be swallowed as the text of the first subtitle.
let sampleSRTString =
"""
1
00:00:00,540 --> 00:00:00,960
Yo-yo

2
00:00:00,960 --> 00:00:01,490
yo

3
00:00:01,520 --> 00:00:01,830
this

4
00:00:01,830 --> 00:00:02,010
is

5
00:00:02,010 --> 00:00:02,460
an

6
00:00:02,460 --> 00:00:02,760
audio

7
00:00:02,760 --> 00:00:03,240
recording
"""
// Field parsers for the individual components of an `HH:MM:SS,mmm` timecode.
// Each one takes a fixed-length prefix of the input and pipes it through an
// unsigned `Int` parser, then is adapted via `.utf8` so it can be combined
// with the other UTF-8 based parsers in this file.

/// Shared two-digit numeric field; hours, minutes and seconds all have this shape.
let twoDigitTimecodeField = Prefix(2).pipe(Int.parser(isSigned: false)).utf8

/// Hours component (two digits).
let timecodeHours = twoDigitTimecodeField
/// Minutes component (two digits).
let timecodeMinutes = twoDigitTimecodeField
/// Seconds component (two digits).
let timecodeSeconds = twoDigitTimecodeField
/// Milliseconds component (three digits).
let timecodeMilliseconds = Prefix(3).pipe(Int.parser(isSigned: false)).utf8
/// Parses a single `HH:MM:SS,mmm` timecode and converts it to a `TimeInterval`
/// expressed in seconds.
///
/// The literal `:` and `,` delimiters are consumed and discarded; only the four
/// numeric fields are kept.
///
/// NOTE(review): no range check is applied to the fields here, so values such
/// as `00:99:99,999` are accepted and simply summed.
let timecodeParser = timecodeHours
  .skip(StartsWith(":".utf8))
  .take(timecodeMinutes)
  .skip(StartsWith(":".utf8))
  .take(timecodeSeconds)
  .skip(StartsWith(",".utf8))
  .take(timecodeMilliseconds)
  .map { hh, mm, ss, ms -> TimeInterval in
    let fromHours = Double(hh) * 3_600
    let fromMinutes = Double(mm) * 60
    let fromMilliseconds = Double(ms) / 1_000
    // Keep this summation order so the floating-point result is unchanged.
    return fromHours + fromMinutes + Double(ss) + fromMilliseconds
  }
/// Parses a full timecode line of the form `start --> end`, producing the pair
/// of start and end times in seconds.
///
/// Parsing fails unless the start time is strictly less than the end time.
let timecodeLineParser = timecodeParser
  .skip(StartsWith(" --> ".utf8))
  .take(timecodeParser)
  // A subtitle must end strictly after it starts.
  .filter { start, end in start < end }
/// One subtitle entry of an SRT file: its position in the file, the time span
/// during which it is shown, and its text.
struct SubtitleGroup {
  /// Sequence number of this entry as written in the file.
  var sequenceNumber: Int

  /// Time, in seconds, at which the subtitle appears.
  var startTimecode: TimeInterval

  /// Time, in seconds, at which the subtitle disappears.
  var endTimecode: TimeInterval

  /// The subtitle text that follows the timecode line.
  var substring: String
}
/// Parses one subtitle group into a `SubtitleGroup`.
///
/// A group is read as:
/// 1. an unsigned sequence number followed by a newline,
/// 2. a `start --> end` timecode line followed by a newline,
/// 3. the subtitle text, which runs up to (but not including) the next blank
///    line (`"\n\n"`, or `"\r\n\r\n"` if that is not found) or, failing both,
///    to the end of the input.
let srtGroupParser = Int.parser(isSigned: false)
  .skip(Newline())
  .take(timecodeLineParser)
  .skip(Newline())
  .take(
    PrefixUpTo("\n\n")
      .orElse(PrefixUpTo("\r\n\r\n"))
      .orElse(Rest())
      .utf8
  )
  .map { number, span, text -> SubtitleGroup in
    let (start, end) = span
    return SubtitleGroup(
      sequenceNumber: number,
      startTimecode: start,
      endTimecode: end,
      substring: String(text)
    )
  }
/// Parses an entire SRT document into an array of `SubtitleGroup`s.
///
/// Groups are separated by two consecutive newlines and the parser requires
/// that nothing is left over after the last group. The parsed groups are then
/// validated as a whole: if their sequence numbers and timecodes are not in
/// increasing order, the parse fails instead of returning the groups.
let srtParser = Many(srtGroupParser, separator: Newline().skip(Newline()))
  .skip(End())
  .flatMap {
    // Succeed with the groups when they are well ordered, otherwise fail.
    $0.sequenceNumbersAndTimecodesAreInIncreasingOrder()
      ? Conditional.first(Always($0))
      : .second(Fail())
  }
private extension Collection where Element == SubtitleGroup {
  /// Checks that the groups form a well-ordered subtitle track.
  ///
  /// - Returns: `true` only when the first group has sequence number `1`, each
  ///   subsequent group's sequence number is exactly one more than its
  ///   predecessor's, and each group starts no earlier than its predecessor
  ///   ends. An empty collection yields `false`.
  func sequenceNumbersAndTimecodesAreInIncreasingOrder() -> Bool {
    var remaining = makeIterator()
    guard var previous = remaining.next(), previous.sequenceNumber == 1 else {
      return false
    }

    while let current = remaining.next() {
      // Sequence numbers must be consecutive.
      guard previous.sequenceNumber + 1 == current.sequenceNumber else { return false }
      // Adjacent groups must have non-decreasing timecodes.
      guard previous.endTimecode <= current.startTimecode else { return false }
      previous = current
    }
    return true
  }
}
// Run the parser over the UTF-8 view of the sample document and print the
// structure of the result for inspection.
let sampleParseResult = srtParser.parse(sampleSRTString.utf8)
dump(sampleParseResult)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment