Skip to content

Instantly share code, notes, and snippets.

@MegaLoler
Last active January 16, 2018 09:34
Show Gist options
  • Save MegaLoler/033648176bc93c99d0fabf536e3916be to your computer and use it in GitHub Desktop.
Save MegaLoler/033648176bc93c99d0fabf536e3916be to your computer and use it in GitHub Desktop.
A little script written in Common Lisp to clean up .srt files.
#!/usr/bin/sbcl --script
;; a simple script to clean up .srt files
;; - removes html and other formatting
;; - removes duplicate segments (after removing formatting)
;; - merges consecutive segments with same start time
;;
;; usage: ./subcleaner.lisp < input.srt > output.srt
;;
;; todo:
;; - handle entries with double new lines?
;; - make ending time of merged entries that of the longer entry
(defun every-other (lst)
"Return every other item from a list."
(loop
for i in lst
for keep = t then (not keep)
if keep collect i))
(defun get-indicators (plist)
"Return the indicators from a plist."
(every-other plist))
(defun change-property (plist prop val)
"Return a new plist with prop changed to val."
(loop
for p in (get-indicators plist)
for v = (if (equal prop p) val (getf plist p))
append (list p v)))
(defun split-with-subseq (seq subseq)
"Recursively split a sequence by occurences of a subsequence."
(let ((position (search subseq seq)))
(if position
(cons (subseq seq 0 position)
(split-with-subseq
(subseq seq (+ position (length subseq)))
subseq))
(list seq))))
;; i'd love to do this with regex, but for now...
;; not using regex because idk how to do that in --script mode
(defun strip-html (s)
"Remove all html tags from a string."
(format nil "~{~A~}"
(mapcar (lambda (s)
(car (last (split-with-subseq s ">"))))
(split-with-subseq s "<"))))
(defun read-string (&optional (stream t))
"Read from stream until EOF."
(coerce (loop for char = (read-char stream nil nil)
until (eq char nil)
collect char) 'string))
(defun trim (s &optional (bag '(#\Space)))
"Trim leading and trailing characters from a string."
(string-trim bag s))
(defun parse-srt-time-range (s)
"Return a dotted pair representing a time range in an .srt file."
(let ((parts (mapcar #'trim (split-with-subseq s "-->"))))
(cons (first parts) (second parts))))
(defun format-srt-time-range (time-range)
"Stringify a dotted pair representing a time range in an .srt file."
(format nil "~a --> ~a" (car time-range) (cdr time-range)))
(defun parse-srt-entry (s)
"Return a plist representation of a .srt entry from a string."
(with-input-from-string (stream s)
(list :sequence (read-line stream)
:time-range (parse-srt-time-range (read-line stream))
:content (read-string stream))))
(defun format-srt-entry (entry)
"Stringy a plist representing an entry from a .srt file."
(format nil "~a~%~a~%~a"
(getf entry :sequence)
(format-srt-time-range (getf entry :time-range))
(getf entry :content)))
(defun empty? (s)
"Whether a sequence is empty."
(= 0 (length s)))
(defun read-srt-entries (s)
"Return a list of entries from a string representing a .srt file."
(mapcar #'parse-srt-entry
(remove-if #'empty?
(split-with-subseq s (format nil "~%~%")))))
(defun format-srt-entries (ls)
"Format a list of .srt entries into a string."
(format nil "~{~a~%~%~}" (mapcar #'format-srt-entry ls)))
(defun negate-predicate (predicate)
"Return a predicate function that computes the inverse of a given predicate function."
(lambda (&rest rest) (not (apply predicate rest))))
(defun remove-consecutive-if-not (predicate ls)
"Keep consecutive items in a list according to a predicate."
(cons (car ls)
(loop
for x in (subseq ls 1)
for y in ls
when (funcall predicate x y)
collect x)))
(defun remove-consecutive-if (predicate ls)
(remove-consecutive-if-not (negate-predicate predicate) ls))
;; figure out how to write this better
(defun merge-consecutive-if (predicate ls merge-func)
"Merge consecutive items filtered by predicate using a merge function."
(let ((last (first ls)))
(append
(loop
for x in (subseq ls 1)
for merge = (funcall predicate x last)
unless merge collect last
unless merge do (setq last x)
when merge do (setq last (funcall merge-func last x)))
(last ls))))
(defun starting-time (entry)
"Get the starting time of an .srt entry."
(car (getf entry :time-range)))
(defun share? (func &rest rest)
"Whether the results of func applied to each argument are same."
(apply #'equal (mapcar func rest)))
(defun make-share? (func)
"Make a predicate function that tests whether its arguments share the results of being applied to func."
(lambda (&rest rest)
(apply #'share? (cons func rest))))
(defun make-getter (prop)
"Return a function that gets prop from a plist it is applied to."
(lambda (plist) (getf plist prop)))
(defun duplicate-srt-entry? (x y)
"Whether two .srt entries are to be considered duplicates."
;; compare start times and content
(and (share? #'starting-time x y)
(share? (make-getter :content) x y)))
(defun remove-srt-duplicates (entries)
"Remove consecutive .srt entries that are duplicates."
(remove-consecutive-if #'duplicate-srt-entry? entries))
;; i'd love to do this with regex, but for now...
;; not using regex because idk how to do that in --script mode
(defun strip-srt-formatting (s)
"Remove special .srt formatting."
(format nil "~{~A~}"
(mapcar (lambda (s)
(car (last (split-with-subseq s "}"))))
(split-with-subseq s "{\\"))))
(defun strip-srt-entry (entry)
"Remove html and other formatting info from the content of an .srt entry."
(change-property entry :content
(strip-html
(strip-srt-formatting
(getf entry :content)))))
(defun strip-srt-entries (entries)
"Strip formatting information from .srt entries."
(mapcar #'strip-srt-entry entries))
(defun merge-srt-entry (x y)
"Make a new .srt entry from two .srt entries with merged content."
(change-property x :content (format nil "~a~%~a"
(getf x :content)
(getf y :content))))
(defun merge-srt-entries (entries)
"Merge entries that share the same starting time."
(merge-consecutive-if (make-share? #'starting-time)
entries
#'merge-srt-entry))
(defun sequence-srt-entry (entry sequence)
"Give an .srt entry a given sequence."
(change-property entry :sequence sequence))
(defun fix-srt-sequence (entries)
"Make the sequence value in the .srt entries accurate."
(loop
for x in entries
for y from 1
collect (sequence-srt-entry x y)))
(defun clean-srt-entries (entries)
"Remove duplicates, merge entries that start at the same time, and strip formatting data."
(let* ((cleaned entries)
(cleaned (strip-srt-entries cleaned))
(cleaned (remove-srt-duplicates cleaned))
(cleaned (merge-srt-entries cleaned)))
(fix-srt-sequence cleaned)))
(defun main ()
"Clean .srt from stdin and split out the result to stdout."
(write-string
(format-srt-entries
(clean-srt-entries
(read-srt-entries (read-string))))))
(main)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment