Created
October 18, 2011 11:35
-
-
Save paulkoerbitz/1295222 to your computer and use it in GitHub Desktop.
Simple webscrape with enlive
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns tutorial.scrape1 | |
(:require [net.cgrand.enlive-html :as html])) | |
;; URL of the prospectus page used by this tutorial. Nothing visible in this
;; file reads it directly; presumably it is passed to `h3+table` at the REPL.
;;
;; Declared dynamic because the *earmuffed* name promises rebindability:
;; on Clojure 1.3+ a non-dynamic earmuffed var triggers a compiler warning and
;; cannot be rebound with `binding`. The map form of the metadata is used so the
;; line also reads on Clojure 1.2, where every var is dynamic anyway.
(def ^{:dynamic true} *url* "http://www.belex.rs/trgovanje/prospekt/VZAS/show")
(defn get-page
  "Builds a java.net.URL from the string `url` and hands it to
  `html/html-resource`, returning whatever that call produces
  (presumably enlive's parsed node structure for the page)."
  [url]
  (-> url
      (java.net.URL.)
      (html/html-resource)))
(defn content->string
  "Flattens `content` into one string.

  * a string is returned unchanged;
  * a map is replaced by the flattening of its :content value;
  * any other collection is flattened element by element and concatenated;
  * anything else (including nil, which becomes \"\") goes through `str`."
  [content]
  (cond
    (string? content) content
    (map? content)    (recur (:content content))
    ;; maps are tested first, because a map also satisfies coll?
    (coll? content)   (apply str (for [piece content]
                                   (content->string piece)))
    ;; (str nil) is "", so nil needs no branch of its own
    :else             (str content)))
;; Register concrete classes/interfaces under namespace-local tags in the
;; global hierarchy, so that `parse-node` can dispatch on ::Map, ::String and
;; ::Collection instead of on individual classes. Registrations run in the
;; listed order.
(doseq [[child parent] [[clojure.lang.PersistentStructMap ::Map]
                        [clojure.lang.PersistentArrayMap  ::Map]
                        [java.lang.String                 ::String]
                        [clojure.lang.ISeq                ::Collection]
                        [clojure.lang.PersistentList      ::Collection]
                        [clojure.lang.LazySeq             ::Collection]]]
  (derive child parent))
(defn tag-type
  "Classifies `node` by its :tag value:

  * :tr and :table      -> ::CompoundNode
  * :th, :td and :h3    -> ::TerminalNode
  * :tbody, a missing :tag, or any other tag -> ::IgnoreNode"
  [node]
  (let [kind-by-tag {:tr    ::CompoundNode
                     :table ::CompoundNode
                     :th    ::TerminalNode
                     :td    ::TerminalNode
                     :h3    ::TerminalNode
                     :tbody ::IgnoreNode}]
    ;; unknown tags fall back to the same kind as :tbody
    (get kind-by-tag (:tag node) ::IgnoreNode)))
(defmulti parse-node
  "Dispatches on the pair [class-of-node kind]. `kind` is the result of
  `tag-type` when the node's class isa ::Map, and nil for every other class."
  (fn [node]
    (let [node-class (class node)]
      [node-class (when (isa? node-class ::Map)
                    (tag-type node))])))
;; Terminal nodes (:th, :td, :h3 per `tag-type`) collapse to the flattened
;; text of their :content.
(defmethod parse-node [::Map ::TerminalNode] [element]
  (-> element :content content->string))
;; Compound nodes (:tr, :table per `tag-type`) become a lazy seq holding the
;; parsed form of each child in :content.
(defmethod parse-node [::Map ::CompoundNode] [element]
  (for [child (:content element)]
    (parse-node child)))
;; Ignored nodes (:tbody and every tag `tag-type` does not list) are
;; transparent: the result is the parse of their :content, with the wrapper
;; itself dropped.
;;
;; A node whose :content is nil (any map without :content, e.g. an empty or
;; void element) used to re-dispatch on nil, which yields the dispatch value
;; [nil nil]; no method is registered for it, so the whole scrape died with
;; IllegalArgumentException. Such nodes now contribute an empty seq instead.
(defmethod parse-node [::Map ::IgnoreNode] [node]
  (if-let [children (:content node)]
    (parse-node children)
    ()))
;; A bare string is already in its final form; hand it back untouched.
(defmethod parse-node [::String nil] [text]
  text)
;; A sequence of nodes is parsed element-wise, producing a lazy seq of results.
(defmethod parse-node [::Collection nil] [nodes]
  (for [item nodes]
    (parse-node item)))
(defn h3+table
  "Fetches the page at `url` with `get-page`, selects the nodes matching
  either [:div#prospekt_container :h3] or [:div#prospekt_container :table],
  and returns a lazy seq with `parse-node` applied to each selected node."
  [url]
  (let [selector #{[:div#prospekt_container :h3]
                   [:div#prospekt_container :table]}
        matches  (html/select (get-page url) selector)]
    (map parse-node matches)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment