Created
September 4, 2015 07:58
-
-
Save ayato-p/599add5f7549ecc6d9f7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns twilog-scraper.core | |
(:require [net.cgrand.enlive-html :as html] | |
[skyscraper :as s]) | |
(:gen-class)) | |
(defn seed
  "Builds the initial scraping context for `username`.

  Returns a one-element vector holding a map with the username, the URL of
  that user's Twilog archives page, and the keyword of the processor
  (:archives-page) that should handle that URL."
  [username]
  [{:username  username
    :url       (str "http://twilog.org/" username "/archives")
    :processor :archives-page}])
;; Processor for a user's archives page.
;;
;; Selects every `a.side-list-icon` anchor whose title attribute is
;; "日別ツイート一覧" ("list of tweets by day") and emits one context map per
;; link, carrying the month identifier parsed from the link URL, the URL
;; itself, and :monthlist-page as the processor for the next step.
(s/defprocessor archives-page
  :cache-template "twilog/:username/archives"
  :process-fn (fn [res context]
                (let [month-links (html/select res [[:a.side-list-icon (html/attr= :title "日別ツイート一覧")]])]
                  (for [link month-links
                        :let [url   (s/href link)
                              ;; The pattern used to be `monthlist-(:?\d+)`, a mistyped
                              ;; group that captured "optional colon + digits" and only
                              ;; worked because `last` happened to pick that capture.
                              ;; Capture the digits explicitly and take the first group.
                              ;; `some->>` keeps a nil URL from reaching `re-find`, which
                              ;; would throw on it.
                              month (some->> url (re-find #"monthlist-(\d+)") second)]
                        ;; A nil month would be substituted into the :month slot of
                        ;; the monthlist cache template, so skip links that do not
                        ;; carry a month identifier.
                        :when month]
                    {:month     month
                     :url       url
                     :processor :monthlist-page}))))
;; Processor for a single month's listing page.
;;
;; Selects every anchor under `div#content ul.side-list.wide` and emits one
;; context map per link, carrying the day identifier parsed from the link
;; URL, the URL itself, and :day-page as the processor for the next step.
(s/defprocessor monthlist-page
  :cache-template "twilog/:username/months/:month"
  :process-fn (fn [res context]
                (let [day-links (html/select res [:div#content :ul.side-list.wide :a])]
                  (for [link day-links
                        :let [url (s/href link)
                              ;; The pattern used to be `date-(:?\d+)`, a mistyped group
                              ;; that captured "optional colon + digits" and only worked
                              ;; because `last` happened to pick that capture. Capture
                              ;; the digits explicitly and take the first group.
                              ;; `some->>` keeps a nil URL from reaching `re-find`, which
                              ;; would throw on it.
                              day (some->> url (re-find #"date-(\d+)") second)]
                        ;; A nil day would be substituted into the :day slot of the
                        ;; day-page cache template, so skip links that do not carry
                        ;; a day identifier.
                        :when day]
                    {:day       day
                     :url       url
                     :processor :day-page}))))
(defn- tweet-attr
  "Returns the text content of the first node inside `t` that matches the
  Enlive `selector`."
  [t selector]
  (-> t
      (html/select selector)
      first
      html/text))
;; Processor for a single day's page.
;;
;; For every `article.tl-tweet` node it yields a map with the tweet body
;; (text of the first `p.tl-text`) under :text and the text of the first
;; anchor inside `p.tl-posted` under :posted-at.
(s/defprocessor day-page
  :cache-template "twilog/:username/days/:day"
  :process-fn (fn [res context]
                (map (fn [tweet]
                       {:text      (tweet-attr tweet [:p.tl-text])
                        :posted-at (tweet-attr tweet [:p.tl-posted :a])})
                     (html/select res [:article.tl-tweet]))))
(defn -main
  "Entry point: scrapes the Twilog archive of `username` and prints the text
  of the first 20 scraped tweets, one per line.

  Returns the realized sequence of those texts so REPL callers can still use
  the value."
  [username]
  ;; `map` and `take` are lazy. The return value of -main is discarded when it
  ;; runs as a program entry point, so without `doall` the scrape would never
  ;; be forced and nothing would be shown.
  (let [texts (doall (map :text (take 20 (s/scrape (seed username) :processed-cache false))))]
    (doseq [text texts]
      (println text))
    texts))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment