Created
August 24, 2013 08:12
-
-
Save shinseitaro/6326819 to your computer and use it in GitHub Desktop.
enlive の 自分メモ
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; The enlive tutorial has become outdated, so I am working through it myself.
(ns tutorial.scrape1
  "Personal notes on scraping Hacker News with enlive."
  (:require [net.cgrand.enlive-html :as html])
  (:import [java.net URL]))
(def base-url
  "Front page URL that every example below scrapes."
  "https://news.ycombinator.com/")
(defn fetch-url
  "Fetches the HTML behind `url` (a string) and returns whatever enlive's
  `html/html-resource` produces for it; the `html/select` calls below
  consume that value."
  [url]
  (html/html-resource (URL. url)))
;; Get the title and link of each news item.
;; Looking at the page source, each entry looks like
;; <td class="title"><a href="http://www.microsoft.com/en-us/news/press/2013/aug13/08-23AnnouncementPR.aspx">Microsoft CEO Steve Ballmer to retire within 12 months</a>
;; so we only need the `a` tags inside `td` elements with class `title`, like this:
;; Select every <a> node nested inside a <td class="title"> cell.
(html/select (fetch-url base-url) [:td.title :a])
;; It returns a sequence of maps like this:
;({:tag :a, :attrs {:href "http://www.microsoft.com/en-us/news/press/2013/aug13/08-23AnnouncementPR.aspx"}, :content ("Microsoft CEO Steve Ballmer to retire within 12 months")} {:tag :a, :attrs {:href "http://www.slate.com/blogs/future_tense/2013/08/23/stack_ranking_steve_ballmer_s_employee_evaluation_system_and_microsoft_s.html"}, :content ("The Poisonous Employee-Ranking System That Helps Explain Microsoft’s Decline")} {:tag :a, :attrs {:href "http://www.ebay.com/itm/Prototype-Hardware-from-Lockheed-Martin-Surveillance-Project-/221272094476?"}, :content ("Prototype Hardware from Lockheed Martin Surveillance Project")}) | |
;; To get only the text part of each map, use the `text` function.
(map html/text
     (html/select (fetch-url base-url) [:td.title :a]))
;; ("Microsoft CEO Steve Ballmer to retire within 12 months" "The Poisonous Employee-Ranking System That Helps Explain Microsoft’s Decline" "Make No Promises" "Prototype Hardware from Lockheed Martin Surveillance Project" "Google and the NSA: Who’s holding the ‘shit-bag’ now?" ...) | |
;; Next, get the subtext shown under each title,
;; e.g. "990 points by tomorgan 18 hours ago | 643 comments".
;; The source looks like this:
;; <td class="subtext"><span id=score_6263205>990 points</span> by <a href="user?id=tomorgan">tomorgan</a> 18 hours ago | <a href="item?id=6263205">643 comments</a></td>
;; so first select the whole cell:
;; Select every <td class="subtext"> cell.
(html/select (fetch-url base-url) [:td.subtext])
;; This returns data like the following:
;; ({:tag :td, :attrs {:class "subtext"}, | |
;; :content ({:tag :span, :attrs {:id "score_6263205"}, :content ("994 points")} " by " | |
;; {:tag :a, :attrs {:href "user?id=tomorgan"}, :content ("tomorgan")} " 18 hours ago | " | |
;; {:tag :a, :attrs {:href "item?id=6263205"}, :content ("646 comments")})} | |
;; | |
;; {:tag :td, :attrs {:class "subtext"}, | |
;; :content ({:tag :span, :attrs {:id "score_6266863"}, :content ("136 points")} " by " | |
;; {:tag :a, :attrs {:href "user?id=probabilistic"}, :content ("probabilistic")} " 7 hours ago | " | |
;; {:tag :a, :attrs {:href "item?id=6266863"}, :content ("106 comments")})} | |
;; {:tag :td, :attrs {:class "subtext"}, | |
;; :content ({:tag :span, :attrs {:id "score_6267439"}, :content ("40 points")} " by " | |
;; {:tag :a, :attrs {:href "user?id=_halgari"}, :content ("_halgari")} " 4 hours ago | " | |
;; {:tag :a, :attrs {:href "item?id=6267439"}, :content ("24 comments")})} | |
;; ....... ) | |
;; To take only the first element of this :content, html/first-child can be used.
(html/select (fetch-url base-url) [:td.subtext html/first-child])
;; By the way, first-child is defined as (def first-child (nth-child 1)),
;; so to take the second (or a later) child, call nth-child directly.
(html/select (fetch-url base-url) [:td.subtext (html/nth-child 2)])
;; (def last-child (nth-last-child 1)) is also defined, so to take the last child:
(html/select (fetch-url base-url) [:td.subtext html/last-child])
;; To get only the text of those nodes:
(map html/text
     (html/select (fetch-url base-url) [:td.subtext html/first-child]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment