Skip to content

Instantly share code, notes, and snippets.

@ssrihari
Created July 24, 2016 08:01
Show Gist options
  • Save ssrihari/08399ab4d3858a7cf3e06ada89a309b2 to your computer and use it in GitHub Desktop.
Save ssrihari/08399ab4d3858a7cf3e06ada89a309b2 to your computer and use it in GitHub Desktop.
(ns wiki-crawler.core
  (:require [clojure.java.io :as io]
            [clojure.string :as s])
  (:import [org.jsoup Jsoup]
           [org.jsoup.nodes Document Element]))
;; Prefix that `fetch` prepends to every page link; links are expected to be
;; site-relative paths such as "/wiki/Clojure".
(def base-url
"https://en.wikipedia.org")
(defn fetch
  "Downloads the page at `page-link` (a path appended to `base-url`) with
  Jsoup and returns whatever the connection's `.get` yields -- the parsed
  document. Performs a network request on every call."
  [page-link]
  (let [url (str base-url page-link)]
    (.get (Jsoup/connect url))))
;; The ^Element type hint lets the compiler resolve `.text` without reflection.
(defn text
  "Returns the result of calling `.text` on element `x`, or nil when `x` is
  nil. Callers in this namespace pass `(first (.select ...))`, which is nil
  whenever the selector matches nothing, so nil must not blow up here."
  [^Element x]
  (when x
    (.text x)))
(defn title
  "Returns the text of the first `h1` element found in `doc`."
  [^Document doc]
  (-> doc
      (.select "h1")
      first
      text))
(defn last-modified
  "Extracts the last-modified stamp from the `#footer-info-lastmod` element
  of `doc`.

  The element's text is split on the phrase \"last modified on \" and the
  final piece is returned; if the phrase does not occur, that piece is the
  whole text. Returns nil when the page has no such footer element instead
  of throwing a NullPointerException."
  [^Document doc]
  (when-let [footer (first (.select doc "#footer-info-lastmod"))]
    (last (s/split (text footer) #"last modified on "))))
(defn first-paragraph
  "Returns the first `p` element inside `#mw-content-text` in `doc`, or nil
  when the selector matches nothing."
  [^Document doc]
  (-> doc
      (.select "#mw-content-text p")
      first))
(defn first-paragraph-text
  "Returns the text of the paragraph selected by `first-paragraph` for `doc`."
  [^Document doc]
  (-> doc
      first-paragraph
      text))
(defn links-to-follow
  "Returns a lazy seq of the `href` values of the anchors in the first
  paragraph of `doc`, keeping only those that start with \"/wiki\".

  Returns an empty seq when the page has no first paragraph (previously a
  NullPointerException). The Element/String hints keep the interop calls
  from falling back to reflection."
  [^Document doc]
  (if-let [^Element paragraph (first-paragraph doc)]
    (->> (.select paragraph "a")
         (map (fn [^Element anchor] (.attr anchor "href")))
         (filter (fn [^String href] (.startsWith href "/wiki"))))
    ()))
(defn table-of-contents
  "Returns a lazy seq with the text of every element matching
  `#toc li .toctext` in `doc`."
  [^Document doc]
  (let [entries (.select doc "#toc li .toctext")]
    (map text entries)))
(defn write
  "Writes the printed form of the `page` map to a file under `pages/`.

  The file name is the page's `:title` with all whitespace removed. Missing
  parent directories are created first, because `spit` does not create them
  and would otherwise fail with FileNotFoundException on a fresh checkout.
  A title containing \"/\" therefore ends up in a nested directory.
  Returns nil."
  [{:keys [title] :as page}]
  (let [filename (s/replace title #"\s" "")
        path     (str "pages/" filename)]
    (io/make-parents path)
    (spit path page)))
(defn info
  "Fetches the page at `url` and returns a map describing it, with the keys
  :title, :last-modified, :first-paragraph, :table-of-contents and
  :links-to-follow, each produced by the extractor of the matching name."
  [url]
  (let [doc        (fetch url)
        extractors (juxt title
                         last-modified
                         first-paragraph-text
                         table-of-contents
                         links-to-follow)]
    ;; Keys and extractors are listed in the same order so they pair up.
    (zipmap [:title
             :last-modified
             :first-paragraph
             :table-of-contents
             :links-to-follow]
            (extractors doc))))
(defn scrape
  "Collects the `info` map for the page at `url` and hands it to `write`."
  [url]
  (-> url
      info
      write))
;; REPL scratchpad: evaluate the inner form by hand to scrape the Clojure
;; article. The `comment` macro keeps it from running when the file loads.
(comment
(scrape "/wiki/Clojure"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment