Last active
September 26, 2015 22:07
-
-
Save stassats/1166140 to your computer and use it in GitHub Desktop.
parse radiolab episodes from wikipedia
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(eval-when (:compile-toplevel :load-toplevel :execute) | |
(:asd :cxml-stp) | |
(:asd :closure-html)) | |
(defun parse-html (file) | |
(with-open-file (stream file | |
:element-type 'unsigned-byte) | |
(chtml:parse stream (stp:make-builder)))) | |
(defun elementp (x) | |
(typep x 'stp:element)) | |
(defun textp (x) | |
(typep x 'stp:text)) | |
(defun find-span (xml class regex) | |
(let ((regex (ppcre:create-scanner regex))) | |
(stp:find-recursively-if | |
(lambda (child) | |
(and (elementp child) | |
(equal (stp:attribute-value child "class") | |
class) | |
(ppcre:scan regex (stp:string-value child)))) | |
xml))) | |
(defun find-next-sibling (xml class) | |
(loop for sibling = (stp:next-sibling xml) | |
then (stp:next-sibling sibling) | |
when (and (elementp sibling) | |
(equal (stp:attribute-value sibling "class") | |
class)) | |
return sibling)) | |
(defun find-season (i xml) | |
(let ((div (find-span xml "mw-headline" | |
(format nil "^Season ~a \\(" i)))) | |
(find-next-sibling (stp:parent div) "wikitable plainrowheaders"))) | |
(defun find-child (local-name xml) | |
(stp:find-child local-name xml :key #'local-name :test #'equal)) | |
(defun url-or-value (node) | |
(stp:string-value node)) | |
(defun parse-table (xml) | |
(let (result) | |
(stp:map-recursively | |
(lambda (child) | |
(when (elementp child) | |
(let ((symbol (find-symbol (string-upcase (stp:local-name child)) | |
#.*package*))) | |
(case symbol | |
(tr (push '(tr) result)) | |
((td th) | |
(push (cons symbol (url-or-value child)) result)))))) | |
xml) | |
(nreverse result))) | |
(defun group-table (xml) | |
(loop with elements = (parse-table xml) | |
while elements | |
when (loop for (type . value) = (pop elements) | |
while type | |
until (eql type 'tr) | |
collect (case type | |
(td value) | |
(th value))) | |
collect it)) | |
(defun get-season (n xml) | |
(parse-episodes n (group-table (find-season n xml)))) | |
(defun remove-quotes (title) | |
(string-trim '(#\" #\Space #\Newline) title)) | |
(defun parse-episodes (season-number season) | |
(loop for ((number name date) (url)) on (cdr season) by #'cddr | |
collect (list :season season-number | |
:episode-number (parse-integer number) | |
:name (remove-quotes name) | |
:air-date (apply #'tracking::make-date (car (tracking::parse-date date))) | |
;; :url url | |
))) | |
(defun find-episode (season n series) | |
(loop for episode in (episodes series) | |
when (and (equal season (season episode)) | |
(equal n (episode-number episode))) | |
return episode)) | |
(defvar *page* nil) | |
(defun populate-episodes (season) | |
(unless *page* | |
(setf *page* | |
(chtml:parse (drakma:http-request "http://en.wikipedia.org/wiki/List_of_Radiolab_episodes") | |
(stp:make-builder)))) | |
(let ((series (find-objects "Radiolab" 'podcast-series)) | |
(episodes (get-season season *page*))) | |
(loop for episode in episodes | |
unless (find-episode season (getf episode :episode-number) series) | |
do (print (apply #'storage:add 'podcast-episode | |
:series series | |
episode))))) | |
(defun generate-downloads (season &optional (from-episode 0)) | |
(let ((series (find-objects "Radiolab" 'podcast-series))) | |
(loop for episode in (episodes series) | |
for air-date = (air-date episode) | |
when (and (eql (season episode) season) | |
air-date | |
(>= (episode-number episode) from-episode)) | |
do (format t "wget -c http://audio4.wnyc.org/radiolab/radiolab~2,'0d~2,'0d~a.mp3 -O ~2,'0d-~2,'0d-'~a'.mp3~%" | |
(value (month air-date)) | |
(value air-date) | |
(rem (value (year air-date)) 100) | |
(season episode) | |
(episode-number episode) | |
(name episode))))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment