Created
July 24, 2016 08:01
-
-
Save ssrihari/08399ab4d3858a7cf3e06ada89a309b2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns wiki-crawler.core | |
(:require [clojure.string :as s]) | |
(:import [org.jsoup Jsoup] | |
[org.jsoup.nodes Document Element])) | |
(def base-url
  "Scheme and host of the site being crawled; page links such as
  \"/wiki/Clojure\" are appended to this string to form the full URL."
  "https://en.wikipedia.org")
(defn fetch
  "Builds the full URL by appending `page-link` to `base-url`, opens a Jsoup
  connection to it and returns the result of the connection's `.get` call."
  [page-link]
  (-> (str base-url page-link)
      Jsoup/connect
      .get))
;; The ^Element type hint lets the compiler resolve `.text` at compile time
;; instead of falling back to reflection.
(defn text
  "Returns the result of calling `.text` on `x`, or nil when `x` is nil.

  Callers in this file pass `(first (.select ...))`, which is nil when the
  selector matches nothing; returning nil here avoids a NullPointerException
  in that case."
  [^Element x]
  (when x
    (.text x)))
(defn title
  "Returns the text of the first element in `doc` matching the `h1` selector."
  [^Document doc]
  (-> doc
      (.select "h1")
      first
      text))
(defn last-modified
  "Returns the part of the `#footer-info-lastmod` text that follows the
  \"last modified on \" / \"last edited on \" phrase, or nil when `doc` has no
  such element.

  If neither phrase occurs, the whole footer text is returned unchanged."
  [^Document doc]
  ;; Guard against a missing footer element instead of throwing an NPE.
  (when-let [footer (first (.select doc "#footer-info-lastmod"))]
    ;; Accept both the older \"modified\" and the newer \"edited\" wording.
    (last (s/split (text footer) #"last (?:modified|edited) on "))))
(defn first-paragraph
  "Returns the first element of `doc` matching `#mw-content-text p`,
  or nil when nothing matches."
  [^Document doc]
  (let [paragraphs (.select doc "#mw-content-text p")]
    (first paragraphs)))
(defn first-paragraph-text
  "Returns the text of the element that `first-paragraph` finds in `doc`."
  [^Document doc]
  (-> doc
      first-paragraph
      text))
(defn links-to-follow
  "Returns a lazy seq of the `href` values of the `a` elements inside the
  first paragraph of `doc`, keeping only those that start with \"/wiki\".

  Returns an empty seq when `doc` has no first paragraph."
  [^Document doc]
  ;; `first-paragraph` yields nil when the selector matches nothing; calling
  ;; `.select` on nil would throw, so fall back to an empty result.
  (if-let [^Element paragraph (first-paragraph doc)]
    (->> (.select paragraph "a")
         ;; Hinted fns avoid reflective `.attr` / `.startsWith` calls.
         (map (fn [^Element anchor] (.attr anchor "href")))
         (filter (fn [^String href] (.startsWith href "/wiki"))))
    ()))
(defn table-of-contents
  "Returns a lazy seq with the text of every element in `doc` matching
  `#toc li .toctext`."
  [^Document doc]
  (for [entry (.select doc "#toc li .toctext")]
    (text entry)))
(defn write
  "Writes `page` (via `spit`, i.e. its string form) to a file under `pages/`.

  The file name is the page's `:title` with all whitespace characters
  removed. Missing parent directories (notably `pages/` itself) are created
  first, so the call no longer fails on a fresh checkout. Returns nil."
  [{:keys [title] :as page}]
  (let [filename (s/replace title #"\s" "")
        path     (str "pages/" filename)]
    ;; `spit` does not create directories; make sure the parent exists.
    (some-> (java.io.File. path) .getParentFile .mkdirs)
    (spit path page)))
(defn info
  "Fetches the page at `url` and returns a map describing it, with the keys
  :title, :last-modified, :first-paragraph, :table-of-contents and
  :links-to-follow, each produced by the extractor of the matching name
  (:first-paragraph comes from `first-paragraph-text`)."
  [url]
  (let [page      (fetch url)
        ;; Run every extractor, in order, against the same fetched document.
        extract   (juxt title
                        last-modified
                        first-paragraph-text
                        table-of-contents
                        links-to-follow)
        page-keys [:title
                   :last-modified
                   :first-paragraph
                   :table-of-contents
                   :links-to-follow]]
    (zipmap page-keys (extract page))))
(defn scrape
  "Collects the page info for `url` with `info` and hands it to `write`."
  [url]
  (-> url
      info
      write))
(comment
  ;; Example invocation, not evaluated when the file is loaded: the
  ;; argument is a page link that `fetch` appends to `base-url`.
  (scrape "/wiki/Clojure"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment