Last active
March 6, 2021 21:06
-
-
Save jackrusher/97734e71bb748ed9c263d6e3daea2b38 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns appliedsciencestudio.covid19-clj-viz.repl | |
(:require [clojure.string :as string] | |
[hickory.core :as hick] | |
[hickory.select :as s])) | |
;;;; Scraping data
(def wiki-page
  "Raw HTML of Wikipedia's list of countries by hospital beds, fetched
  once when this namespace is loaded. The data is only published as
  HTML, so it has to be scraped from the page."
  (let [url "https://en.wikipedia.org/wiki/List_of_countries_by_hospital_beds"]
    (slurp url)))
(defn deepest-text
  "Drill down to the deepest text node(s) under `node` and return them,
  concatenated in order, as a single string.

  `node` may be:
  - a sequential collection of child nodes (each is flattened recursively),
  - a map, in which case its `:content` entry is flattened recursively,
  - anything else, which is treated as a leaf and converted with `str`.

  Always returns a string: a node without content (`:content` nil or
  absent) yields the empty string rather than nil."
  [node]
  (cond
    ;; `map` rather than `mapcat`: each child already flattens to a string,
    ;; so there is no need to explode it into characters before joining.
    (sequential? node) (apply str (map deepest-text node))
    (map? node)        (deepest-text (:content node))
    ;; `(str nil)` is the empty string, so content-less elements still
    ;; produce a string result.
    :else              (str node)))
(defn extract-tables
  "Parse the HTML string `html` and return a vector with one entry per
  selected :table element. Each entry is a vector of rows (one per :tr
  selected inside that table), and each row is a vector holding the
  `deepest-text` of every :th / :td selected inside that row."
  [html]
  (let [cell?       (s/or (s/tag :th) (s/tag :td))
        row->cells  (fn [row]
                      (mapv deepest-text (s/select cell? row)))
        table->rows (fn [table]
                      (mapv row->cells (s/select (s/tag :tr) table)))
        document    (hick/as-hickory (hick/parse html))]
    (mapv table->rows (s/select (s/tag :table) document))))
;; REPL exploration: take the first table scraped from the page and skip
;; its first three rows.
;; NOTE(review): the 3 presumably skips header rows -- confirm against the page.
(->> wiki-page
     extract-tables
     first
     (drop 3))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment