Skip to content

Instantly share code, notes, and snippets.

@MegaLoler
Last active January 18, 2018 08:44
Show Gist options
  • Save MegaLoler/5bdd687115d3dea37e7d965952d5905e to your computer and use it in GitHub Desktop.
Save MegaLoler/5bdd687115d3dea37e7d965952d5905e to your computer and use it in GitHub Desktop.
Scrape Japanese Memrise course edit page html for items and audio
#!/usr/bin/sbcl --script
;; A simple script that scrapes the HTML of a Japanese Memrise course edit page.
;; To select the output type, change which main function is called at the very bottom:
;; - main: outputs .tsv
;; - main-2: outputs .html
;; - main-3: outputs list of audio links
;; - main-4: outputs .tsv with audio links formatted for anki (and urls converted to sound file names, so download the audio files and put them in your anki media folder)
;; usage: ./memrise.lisp < memrise.html > output_file
(defun split-with-subseq (seq subseq)
  "Split the sequence SEQ at every occurrence of the subsequence SUBSEQ.

Returns a list of the pieces of SEQ lying between consecutive,
non-overlapping matches of SUBSEQ (found left to right); the matches
themselves are dropped.  The list always has at least one element.  When
SUBSEQ does not occur in SEQ, or SUBSEQ is empty, the result is a
one-element list holding SEQ itself."
  (let ((step (length subseq)))
    (if (zerop step)
        ;; An empty separator matches at every position without consuming
        ;; anything, so splitting on it would never terminate.
        (list seq)
        ;; Scan forward with :START2 instead of recursing on a freshly
        ;; copied tail: this keeps the work linear in the length of SEQ
        ;; and the stack depth constant.
        (loop with start = 0
              for position = (search subseq seq :start2 start)
              while position
              collect (subseq seq start position) into pieces
              do (setf start (+ position step))
              finally (return (nconc pieces
                                     (list (if (zerop start)
                                               seq ; no match at all
                                               (subseq seq start)))))))))
(defun read-string (&optional (stream *standard-input*))
  "Read every remaining character from STREAM and return them as one string.

STREAM defaults to *STANDARD-INPUT*, so input redirected into the script
is what gets read.  (A default of T would designate *TERMINAL-IO* for
READ-CHAR, not standard input.)  Returns the empty string when STREAM is
already at end of file."
  ;; Accumulate straight into a string stream rather than building an
  ;; intermediate list with one cons per character.
  (with-output-to-string (out)
    (loop for char = (read-char stream nil nil)
          while char
          do (write-char char out))))
(defun trim (s &optional (bag '(#\Space)))
  "Return S with the characters in BAG stripped from both of its ends.
BAG defaults to a list containing only the space character."
  (let ((stripped (string-trim bag s)))
    stripped))
(defun format-tsv-entry-anki-audio (entry)
  "Render ENTRY as one tab-separated line whose last column holds sound tags.

The first five elements of ENTRY are written as the kana, english, common,
kanji and word type columns.  The last element of ENTRY is taken to be a
list of audio URLs; each one is reduced to the text after its final slash
and written as [sound:NAME], with the tags separated by single spaces."
  (flet ((sound-file-name (url)
           ;; The last piece of URL once it is split on slashes.
           (first (last (split-with-subseq url "/")))))
    (let ((tab #\tab)
          (file-names (loop for url in (car (last entry))
                            collect (sound-file-name url))))
      (format nil "~A~C~A~C~A~C~A~C~A~C~{[sound:~A]~^ ~}"
              (nth 0 entry) tab         ; kana
              (nth 1 entry) tab         ; english
              (nth 2 entry) tab         ; common
              (nth 3 entry) tab         ; kanji
              (nth 4 entry) tab         ; word type
              file-names))))            ; audio file names
(defun format-tsv-entries-anki-audio (entries)
  "Return the text of a .tsv file for ENTRIES with sound-tag audio columns.

The first line is a header starting with #; each following line is one
element of ENTRIES rendered by FORMAT-TSV-ENTRY-ANKI-AUDIO.  Lines are
separated by newlines and there is no newline after the last one."
  (let ((tab #\tab)
        (lines (loop for entry in entries
                     collect (format-tsv-entry-anki-audio entry))))
    (format nil "#kana~Cenglish~Ccommon~Ckanji~Cword type~Caudio~%~{~a~^~%~}"
            tab tab tab tab tab
            lines)))
(defun format-tsv-entry (entry)
  "Render ENTRY as one tab-separated line.

The first five elements of ENTRY are written as the kana, english, common,
kanji and word type columns.  The last element of ENTRY is taken to be a
list of audio links, which are written space-separated in a sixth column."
  (let ((tab #\tab)
        (audio-links (car (last entry))))
    (format nil "~A~C~A~C~A~C~A~C~A~C~{~A~^ ~}"
            (nth 0 entry) tab           ; kana
            (nth 1 entry) tab           ; english
            (nth 2 entry) tab           ; common
            (nth 3 entry) tab           ; kanji
            (nth 4 entry) tab           ; word type
            audio-links)))
(defun format-tsv-entries (entries)
  "Return the text of a .tsv file for ENTRIES.

The first line is a header starting with #; each following line is one
element of ENTRIES rendered by FORMAT-TSV-ENTRY.  Lines are separated by
newlines and there is no newline after the last one."
  (let ((tab #\tab)
        (lines (loop for entry in entries
                     collect (format-tsv-entry entry))))
    (format nil "#kana~Cenglish~Ccommon~Ckanji~Cword type~Caudio links~%~{~a~^~%~}"
            tab tab tab tab tab
            lines)))
(defun format-html-entry (entry)
  "Render ENTRY as one html table row.

The first five elements of ENTRY become the kana, english, common, kanji
and word type cells.  The last element of ENTRY is taken to be a list of
audio links; a sixth cell holds one audio element per link.  Values are
inserted as they are, without any escaping."
  (let ((kana (nth 0 entry))
        (english (nth 1 entry))
        (common (nth 2 entry))
        (kanji (nth 3 entry))
        (word-type (nth 4 entry))
        (audio-links (car (last entry))))
    (format nil "<tr><td>~A</td><td>~A</td><td>~A</td><td>~A</td><td>~A</td><td>~{<audio controls src=\"~A\"></audio>~}</td></tr>"
            kana english common kanji word-type audio-links)))
(defun format-html-entries (entries)
  "Return an html table for ENTRIES.

The table starts with a header row (Kana, English, Common, Kanji, Type,
Audio) followed by one row per element of ENTRIES, each rendered by
FORMAT-HTML-ENTRY."
  (let ((rows (loop for entry in entries
                    collect (format-html-entry entry))))
    (format nil "<table><tr><th>Kana</th><th>English</th><th>Common</th><th>Kanji</th><th>Type</th><th>Audio</th></tr>~{~a~}</table>"
            rows)))
(defun read-memrise-entries (s)
  "Extract the list of items from the html string S.

S is cut at every opening tr tag of class thing; the text before the first
such tag is ignored and each remaining chunk yields one entry.  An entry
is a list made of, in order:
  - for every div of class text in the chunk, the text up to the next
    closing div tag, with surrounding spaces trimmed;
  - as its final element, a list of the values of all data-url attributes
    found in the chunk (possibly empty)."
  (flet ((pieces-after (text marker)
           ;; What follows each occurrence of MARKER in TEXT; the part
           ;; before the first occurrence is discarded.
           (rest (split-with-subseq text marker)))
         (up-to (text terminator)
           ;; The part of TEXT that precedes the first TERMINATOR.
           (first (split-with-subseq text terminator))))
    (loop for row in (pieces-after s "<tr class=\"thing\"")
          collect (let ((texts (loop for cell in (pieces-after row "<div class=\"text\">")
                                     collect (trim (up-to cell "</div>"))))
                        (audio-links (loop for tail in (pieces-after row "data-url=\"")
                                           collect (up-to tail "\""))))
                    (append texts (list audio-links))))))
(defun main ()
  "Read a memrise course edit page with READ-STRING and print it as .tsv.
The page comes from READ-STRING's default stream; the result is written
to standard output."
  (let* ((html (read-string))
         (entries (read-memrise-entries html)))
    (write-string (format-tsv-entries entries))))
(defun main-2 ()
  "Read a memrise course edit page with READ-STRING and print an html table.
The page comes from READ-STRING's default stream; the result is written
to standard output."
  (let* ((html (read-string))
         (entries (read-memrise-entries html)))
    (write-string (format-html-entries entries))))
(defun main-3 ()
  "Print every audio link of the page, one per line.
The page is read with READ-STRING from its default stream.  Links appear
in entry order and there is no newline after the last one."
  (let ((links '()))
    ;; Gather the links of all entries into one flat list, keeping order.
    (dolist (entry (read-memrise-entries (read-string)))
      (dolist (link (car (last entry)))
        (push link links)))
    (format t "~{~A~^~%~}" (nreverse links))))
(defun main-4 ()
  "Read a memrise course edit page with READ-STRING and print .tsv for anki.
The audio column holds sound tags instead of raw links.  The page comes
from READ-STRING's default stream; the result is written to standard
output."
  (let* ((html (read-string))
         (entries (read-memrise-entries html)))
    (write-string (format-tsv-entries-anki-audio entries))))
;; Script entry point: the uncommented call below runs when the file is loaded.
;; To produce a different output, comment it out and uncomment one of the others.
(main) ;; outputs .tsv
;(main-2) ;; outputs .html
;(main-3) ;; outputs audio links, one per line
;(main-4) ;; outputs .tsv with audio formatted for anki (and urls converted to sound file names)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment