Last active
January 16, 2018 09:34
-
-
Save MegaLoler/033648176bc93c99d0fabf536e3916be to your computer and use it in GitHub Desktop.
A little script written in Common Lisp to clean up .srt files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/sbcl --script | |
;; a simple script to clean up .srt files | |
;; - removes html and other formatting | |
;; - removes duplicate segments (after removing formatting) | |
;; - merges consecutive segments with same start time | |
;; | |
;; usage: ./subcleaner.lisp < input.srt > output.srt | |
;; | |
;; todo: | |
;; - handle entries with double new lines? | |
;; - make ending time of merged entries that of the longer entry | |
(defun every-other (lst) | |
"Return every other item from a list." | |
(loop | |
for i in lst | |
for keep = t then (not keep) | |
if keep collect i)) | |
(defun get-indicators (plist) | |
"Return the indicators from a plist." | |
(every-other plist)) | |
(defun change-property (plist prop val) | |
"Return a new plist with prop changed to val." | |
(loop | |
for p in (get-indicators plist) | |
for v = (if (equal prop p) val (getf plist p)) | |
append (list p v))) | |
(defun split-with-subseq (seq subseq) | |
"Recursively split a sequence by occurences of a subsequence." | |
(let ((position (search subseq seq))) | |
(if position | |
(cons (subseq seq 0 position) | |
(split-with-subseq | |
(subseq seq (+ position (length subseq))) | |
subseq)) | |
(list seq)))) | |
;; i'd love to do this with regex, but for now... | |
;; not using regex because idk how to do that in --script mode | |
(defun strip-html (s) | |
"Remove all html tags from a string." | |
(format nil "~{~A~}" | |
(mapcar (lambda (s) | |
(car (last (split-with-subseq s ">")))) | |
(split-with-subseq s "<")))) | |
(defun read-string (&optional (stream t)) | |
"Read from stream until EOF." | |
(coerce (loop for char = (read-char stream nil nil) | |
until (eq char nil) | |
collect char) 'string)) | |
(defun trim (s &optional (bag '(#\Space))) | |
"Trim leading and trailing characters from a string." | |
(string-trim bag s)) | |
(defun parse-srt-time-range (s) | |
"Return a dotted pair representing a time range in an .srt file." | |
(let ((parts (mapcar #'trim (split-with-subseq s "-->")))) | |
(cons (first parts) (second parts)))) | |
(defun format-srt-time-range (time-range) | |
"Stringify a dotted pair representing a time range in an .srt file." | |
(format nil "~a --> ~a" (car time-range) (cdr time-range))) | |
(defun parse-srt-entry (s) | |
"Return a plist representation of a .srt entry from a string." | |
(with-input-from-string (stream s) | |
(list :sequence (read-line stream) | |
:time-range (parse-srt-time-range (read-line stream)) | |
:content (read-string stream)))) | |
(defun format-srt-entry (entry) | |
"Stringy a plist representing an entry from a .srt file." | |
(format nil "~a~%~a~%~a" | |
(getf entry :sequence) | |
(format-srt-time-range (getf entry :time-range)) | |
(getf entry :content))) | |
(defun empty? (s) | |
"Whether a sequence is empty." | |
(= 0 (length s))) | |
(defun read-srt-entries (s) | |
"Return a list of entries from a string representing a .srt file." | |
(mapcar #'parse-srt-entry | |
(remove-if #'empty? | |
(split-with-subseq s (format nil "~%~%"))))) | |
(defun format-srt-entries (ls) | |
"Format a list of .srt entries into a string." | |
(format nil "~{~a~%~%~}" (mapcar #'format-srt-entry ls))) | |
(defun negate-predicate (predicate) | |
"Return a predicate function that computes the inverse of a given predicate function." | |
(lambda (&rest rest) (not (apply predicate rest)))) | |
(defun remove-consecutive-if-not (predicate ls) | |
"Keep consecutive items in a list according to a predicate." | |
(cons (car ls) | |
(loop | |
for x in (subseq ls 1) | |
for y in ls | |
when (funcall predicate x y) | |
collect x))) | |
(defun remove-consecutive-if (predicate ls) | |
(remove-consecutive-if-not (negate-predicate predicate) ls)) | |
;; figure out how to write this better | |
(defun merge-consecutive-if (predicate ls merge-func) | |
"Merge consecutive items filtered by predicate using a merge function." | |
(let ((last (first ls))) | |
(append | |
(loop | |
for x in (subseq ls 1) | |
for merge = (funcall predicate x last) | |
unless merge collect last | |
unless merge do (setq last x) | |
when merge do (setq last (funcall merge-func last x))) | |
(last ls)))) | |
(defun starting-time (entry) | |
"Get the starting time of an .srt entry." | |
(car (getf entry :time-range))) | |
(defun share? (func &rest rest) | |
"Whether the results of func applied to each argument are same." | |
(apply #'equal (mapcar func rest))) | |
(defun make-share? (func) | |
"Make a predicate function that tests whether its arguments share the results of being applied to func." | |
(lambda (&rest rest) | |
(apply #'share? (cons func rest)))) | |
(defun make-getter (prop) | |
"Return a function that gets prop from a plist it is applied to." | |
(lambda (plist) (getf plist prop))) | |
(defun duplicate-srt-entry? (x y) | |
"Whether two .srt entries are to be considered duplicates." | |
;; compare start times and content | |
(and (share? #'starting-time x y) | |
(share? (make-getter :content) x y))) | |
(defun remove-srt-duplicates (entries) | |
"Remove consecutive .srt entries that are duplicates." | |
(remove-consecutive-if #'duplicate-srt-entry? entries)) | |
;; i'd love to do this with regex, but for now... | |
;; not using regex because idk how to do that in --script mode | |
(defun strip-srt-formatting (s) | |
"Remove special .srt formatting." | |
(format nil "~{~A~}" | |
(mapcar (lambda (s) | |
(car (last (split-with-subseq s "}")))) | |
(split-with-subseq s "{\\")))) | |
(defun strip-srt-entry (entry) | |
"Remove html and other formatting info from the content of an .srt entry." | |
(change-property entry :content | |
(strip-html | |
(strip-srt-formatting | |
(getf entry :content))))) | |
(defun strip-srt-entries (entries) | |
"Strip formatting information from .srt entries." | |
(mapcar #'strip-srt-entry entries)) | |
(defun merge-srt-entry (x y) | |
"Make a new .srt entry from two .srt entries with merged content." | |
(change-property x :content (format nil "~a~%~a" | |
(getf x :content) | |
(getf y :content)))) | |
(defun merge-srt-entries (entries) | |
"Merge entries that share the same starting time." | |
(merge-consecutive-if (make-share? #'starting-time) | |
entries | |
#'merge-srt-entry)) | |
(defun sequence-srt-entry (entry sequence) | |
"Give an .srt entry a given sequence." | |
(change-property entry :sequence sequence)) | |
(defun fix-srt-sequence (entries) | |
"Make the sequence value in the .srt entries accurate." | |
(loop | |
for x in entries | |
for y from 1 | |
collect (sequence-srt-entry x y))) | |
(defun clean-srt-entries (entries) | |
"Remove duplicates, merge entries that start at the same time, and strip formatting data." | |
(let* ((cleaned entries) | |
(cleaned (strip-srt-entries cleaned)) | |
(cleaned (remove-srt-duplicates cleaned)) | |
(cleaned (merge-srt-entries cleaned))) | |
(fix-srt-sequence cleaned))) | |
(defun main () | |
"Clean .srt from stdin and split out the result to stdout." | |
(write-string | |
(format-srt-entries | |
(clean-srt-entries | |
(read-srt-entries (read-string)))))) | |
(main) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment