Last active
August 15, 2018 13:57
-
-
Save sunng87/4432937 to your computer and use it in GitHub Desktop.
A ClojureScript script that downloads pages from the Disney Wiki. Targets Node.js.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns crawler.core | |
(:require [cljs.nodejs :as node])) | |
;; Node.js core modules used for HTTP requests and file-system access.
(def http (node/require "http"))
(def fs (node/require "fs"))

;; Directory (relative to the working directory) that receives all output.
(def local-path-root "out/")

;; Create the output directory at load time. An already-existing directory
;; is expected on re-runs and is ignored; any other failure is reported
;; instead of being silently swallowed.
(.mkdir fs local-path-root
        (fn [e]
          (when (and e (not= "EEXIST" (.-code e)))
            (println (str "Could not create " local-path-root ": " (.-message e))))))
(defn start-req
  "Issues an HTTP GET for `url`, handing the response object to `res-fn`.

  If the request emits an \"error\" event, the error message is printed and
  the same request is started again with the same arguments (there is no
  retry limit). Returns the request object."
  [url res-fn]
  (let [retry   (fn [err]
                  (println (.-message err))
                  (start-req url res-fn))
        request (.get http url res-fn)]
    (.on request "error" retry)
    (.end request)
    request))
;; Forward declarations:
;;   *fd*  - dynamic var holding the file descriptor of the category listing
;;           currently being written; bound around each `finish-cb` call.
;;   cb-fn - page-list callback, defined further down.
(declare ^:dynamic *fd* cb-fn)
(defn start-category-req*
  "Fetches one batch of members of wiki category `category` from the
  categorymembers API and passes the decoded result to `finish-cb`.

  category  - category name, inserted verbatim after `Category:` in the URL.
  from-page - `cmcontinue` token of the batch to fetch, or nil for the
              first batch.
  finish-cb - function of [data category]; `data` is the JSON response
              converted with `js->clj` (string keys).
  fd        - file descriptor that is bound to `*fd*` while `finish-cb`
              runs."
  [category from-page finish-cb fd]
  (let [;; Plain `str` is used to build the URL: `format` is not provided by
        ;; current cljs.core.
        base-url (str "http://disney.wikia.com/api.php?format=json&action=query&list=categorymembers&cmtitle=Category:"
                      category)
        ;; The continue token comes straight from a previous response and
        ;; may contain characters that are special in a query string, so it
        ;; is percent-encoded before being appended.
        url      (if (nil? from-page)
                   base-url
                   (str base-url "&cmcontinue=" (js/encodeURIComponent from-page)))
        chunks   (atom [])]
    (start-req url
               (fn [res]
                 ;; Have Node decode the stream as UTF-8 so that a
                 ;; multi-byte character split across two chunks is not
                 ;; corrupted when the chunks are concatenated below.
                 (.setEncoding res "utf8")
                 (.on res "data"
                      (fn [chunk]
                        (swap! chunks conj chunk)))
                 (.on res "end"
                      (fn []
                        (let [all-text (apply str @chunks)
                              data     (js->clj (.parse js/JSON all-text))]
                          ;; Dynamic bindings do not survive into async
                          ;; callbacks, so *fd* is re-established here.
                          (binding [*fd* fd]
                            (finish-cb data category)))))))))
(defn cb-fn
  "Handles one decoded categorymembers response.

  Writes a `pageid<TAB>title<NEWLINE>` line for every member to the file
  descriptor bound to `*fd*`. When the response carries a `cmcontinue`
  token that begins with \"page\", the next batch is requested with this
  function as the callback; otherwise the file descriptor is closed."
  [data category]
  (let [members     (get-in data ["query" "categorymembers"])
        next-page   (get-in data ["query-continue" "categorymembers" "cmcontinue"])
        ;; Continue only while a token exists and it starts with "page".
        more-pages? (and (not (nil? next-page))
                         (zero? (.indexOf next-page "page")))]
    (doseq [member members]
      (let [line      (str (get member "pageid")
                           "\t"
                           (get member "title")
                           "\n")
            data-line (js/Buffer. line)]
        (.writeSync fs *fd* data-line 0 (.-length data-line))))
    (if more-pages?
      (do
        (println (str "do fetching next page: " next-page))
        (start-category-req* category next-page cb-fn *fd*))
      (.close fs *fd*))))
(defn start-category-req
  "Starts building the page listing for `category`.

  Opens `<local-path-root><category>.cat` for writing and requests the
  first batch of category members, with `cb-fn` as the per-batch callback.
  If the listing file cannot be opened, the error is printed and no request
  is made."
  [category]
  (let [listing-path (str local-path-root category ".cat")]
    (.open fs listing-path "w"
           (fn [err fd]
             (if err
               ;; Without a valid descriptor every later write would fail,
               ;; so report the problem and stop here.
               (println (str "Could not open " listing-path ": " (.-message err)))
               ;; fd is passed explicitly; start-category-req* binds *fd*
               ;; itself around the callback, so no binding is needed here.
               (start-category-req* category nil cb-fn fd))))))
(defn start-page-req
  "Downloads the rendered HTML of one wiki page into
  `<local-path-root><category>/<page-name with every \"/\" removed>`.

  category  - category name; used as the output sub-directory.
  page-id   - wiki page id, sent as the `curid` query parameter.
  page-name - page title; used for log output and the file name.
  dcb       - zero-argument function called once this page is finished
              (also called when the output file cannot be opened, so that
              the caller's chain of downloads keeps going)."
  [category page-id page-name dcb]
  (println (str "Downloading " page-name))
  (let [;; Plain `str` builds the URL: `format` is not provided by current
        ;; cljs.core.
        url          (str "http://disney.wikia.com/wiki/?action=render&curid=" page-id)
        category-dir (str local-path-root category)
        ;; A string pattern would only replace the first "/"; a global
        ;; regex strips all of them so the name cannot introduce extra
        ;; path segments.
        file-name    (.replace page-name (js/RegExp. "/" "g") "")
        local-path   (str category-dir "/" file-name)]
    ;; Make sure the per-category directory exists. It usually already
    ;; does after the first page; a genuine failure surfaces through the
    ;; `err` argument of the open call below.
    (try
      (.mkdirSync fs category-dir)
      (catch js/Error _ nil))
    (.open fs local-path "w"
           (fn [err local-fd]
             (if err
               (do
                 (println (str "Could not open " local-path ": " (.-message err)))
                 (dcb))
               (start-req url
                          (fn [res]
                            (.on res "data"
                                 (fn [chunk]
                                   ;; Synchronous write keeps chunks in
                                   ;; order and guarantees they are on
                                   ;; their way before the close below.
                                   (.writeSync fs local-fd chunk 0 (.-length chunk))))
                            (.on res "end"
                                 (fn []
                                   (.closeSync fs local-fd)
                                   (println (str "Downloaded " page-name))
                                   (dcb))))))))))
(defn dcb
  "Downloads the pages in `page-dir` one after another.

  `page-dir` is a sequence of [page-id page-name] pairs. The first pair is
  downloaded with `start-page-req`; its completion callback re-enters this
  function with the remaining pairs. Does nothing when `page-dir` is empty."
  [category page-dir]
  (when (seq page-dir)
    (let [[page-id page-name] (first page-dir)
          continue            (fn [] (dcb category (rest page-dir)))]
      (start-page-req category page-id page-name continue))))
(defn download-category-pages
  "Reads the listing `<local-path-root><category>.cat` (one
  `pageid<TAB>title` entry per line) and downloads every listed page via
  `dcb`."
  [category]
  (let [listing  (.readFileSync fs (str local-path-root category ".cat") "UTF-8")
        ;; Skip empty lines instead of blindly dropping the last element:
        ;; the trailing newline still produces no entry, but a listing that
        ;; lacks a final newline no longer loses its last page.
        page-dir (->> (.split listing "\n")
                      (remove #(= "" %))
                      (map #(vec (.split % "\t"))))]
    (dcb category page-dir)))
(defn starter
  "Program entry point: prints a greeting, then downloads every page listed
  for the \"Disney_characters\" category.

  The disabled `start-category-req` forms build the `.cat` listing files;
  re-enable the relevant one to (re)generate a listing."
  []
  (println "hello world")
  ;; Listing generation, currently disabled with the #_ reader macro.
  #_(start-category-req "Disney_characters")
  #_(start-category-req "Disney_franchises")
  #_(start-category-req "Films")
  #_(start-category-req "Television_series_by_Disney")
  ;; Page download for the already-generated listing.
  (download-category-pages "Disney_characters"))

;; Register `starter` as the program's main function.
(set! *main-cli-fn* starter)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment