Skip to content

Instantly share code, notes, and snippets.

@stassats
Last active September 26, 2015 22:07
Show Gist options
  • Save stassats/1166140 to your computer and use it in GitHub Desktop.
Save stassats/1166140 to your computer and use it in GitHub Desktop.
parse radiolab episodes from wikipedia
(eval-when (:compile-toplevel :load-toplevel :execute)
(:asd :cxml-stp)
(:asd :closure-html))
(defun parse-html (file)
(with-open-file (stream file
:element-type 'unsigned-byte)
(chtml:parse stream (stp:make-builder))))
(defun elementp (x)
(typep x 'stp:element))
(defun textp (x)
(typep x 'stp:text))
(defun find-span (xml class regex)
(let ((regex (ppcre:create-scanner regex)))
(stp:find-recursively-if
(lambda (child)
(and (elementp child)
(equal (stp:attribute-value child "class")
class)
(ppcre:scan regex (stp:string-value child))))
xml)))
(defun find-next-sibling (xml class)
(loop for sibling = (stp:next-sibling xml)
then (stp:next-sibling sibling)
when (and (elementp sibling)
(equal (stp:attribute-value sibling "class")
class))
return sibling))
(defun find-season (i xml)
(let ((div (find-span xml "mw-headline"
(format nil "^Season ~a \\(" i))))
(find-next-sibling (stp:parent div) "wikitable plainrowheaders")))
(defun find-child (local-name xml)
(stp:find-child local-name xml :key #'local-name :test #'equal))
(defun url-or-value (node)
(stp:string-value node))
(defun parse-table (xml)
(let (result)
(stp:map-recursively
(lambda (child)
(when (elementp child)
(let ((symbol (find-symbol (string-upcase (stp:local-name child))
#.*package*)))
(case symbol
(tr (push '(tr) result))
((td th)
(push (cons symbol (url-or-value child)) result))))))
xml)
(nreverse result)))
(defun group-table (xml)
(loop with elements = (parse-table xml)
while elements
when (loop for (type . value) = (pop elements)
while type
until (eql type 'tr)
collect (case type
(td value)
(th value)))
collect it))
(defun get-season (n xml)
(parse-episodes n (group-table (find-season n xml))))
(defun remove-quotes (title)
(string-trim '(#\" #\Space #\Newline) title))
(defun parse-episodes (season-number season)
(loop for ((number name date) (url)) on (cdr season) by #'cddr
collect (list :season season-number
:episode-number (parse-integer number)
:name (remove-quotes name)
:air-date (apply #'tracking::make-date (car (tracking::parse-date date)))
;; :url url
)))
(defun find-episode (season n series)
(loop for episode in (episodes series)
when (and (equal season (season episode))
(equal n (episode-number episode)))
return episode))
(defvar *page* nil)
(defun populate-episodes (season)
(unless *page*
(setf *page*
(chtml:parse (drakma:http-request "http://en.wikipedia.org/wiki/List_of_Radiolab_episodes")
(stp:make-builder))))
(let ((series (find-objects "Radiolab" 'podcast-series))
(episodes (get-season season *page*)))
(loop for episode in episodes
unless (find-episode season (getf episode :episode-number) series)
do (print (apply #'storage:add 'podcast-episode
:series series
episode)))))
(defun generate-downloads (season &optional (from-episode 0))
(let ((series (find-objects "Radiolab" 'podcast-series)))
(loop for episode in (episodes series)
for air-date = (air-date episode)
when (and (eql (season episode) season)
air-date
(>= (episode-number episode) from-episode))
do (format t "wget -c http://audio4.wnyc.org/radiolab/radiolab~2,'0d~2,'0d~a.mp3 -O ~2,'0d-~2,'0d-'~a'.mp3~%"
(value (month air-date))
(value air-date)
(rem (value (year air-date)) 100)
(season episode)
(episode-number episode)
(name episode)))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment