Last active
December 29, 2015 20:59
-
-
Save elfenlaid/7727000 to your computer and use it in GitHub Desktop.
Trying to have some fun with Clojure
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(ns hcache.core | |
(:require [clj-http.lite.client :as client] | |
[clojure.string :as string] | |
[clojure.core.async :as async :refer [>! <! >!! <!! go chan]]) | |
(:import (java.io File))) | |
(def h
  "Request headers map; carries a browser-style User-Agent string."
  {"User-Agent"
   "Mozilla/5.0 (Windows NT 6.1;) Gecko/20100101 Firefox/13.0.1"})
(defn page
  "Performs an HTTP GET on `url` via client/get, passing `h` as the request
  headers, and returns the :body of the response."
  [url]
  (-> url
      (client/get {:headers h})
      :body))
(defn page-imgs
  "Returns a lazy seq of the png/jpg/jpeg references found in the `page`
  string. A reference is matched only when it is wrapped in quotes or
  parentheses; those wrapping characters are removed from each result."
  [page]
  (let [img-re #"['\"(][^\s'\"()]+\.(?:png|jpg|jpeg)['\")]"
        unwrap #(string/replace % #"['\"()]" "")]
    (map unwrap (re-seq img-re page))))
(defn compose-path
  "Joins path segments using java.io.File, ignoring empty or nil segments.

  - one segment: returns it, or nil when it is empty
  - two segments: returns the File-joined path when both are non-empty,
    otherwise whichever one is non-empty (nil when neither is)
  - more segments: folds the two-segment form over all of them, left to right"
  ([x]
   (not-empty x))
  ([x y]
   (let [head (not-empty x)
         tail (not-empty y)]
     (if (and head tail)
       (str (File. x y))
       (or head tail))))
  ([x y & paths]
   (reduce compose-path (compose-path x y) paths)))
(defn resource-name
  "Returns the text after the final \"/\" in `img`, i.e. the last element
  produced by splitting on \"/\". Returns nil when the split yields nothing
  (for example when `img` is just \"/\")."
  [img]
  (-> img
      (string/split #"/")
      last))
(defn move-url-to-dir
  "Rewrites every occurrence of `url` inside the `page` string so that it
  points at the resource's file name placed under `dir`.

  dir  - target directory; may be empty, in which case only the file name
         is used
  page - page content as a string
  url  - the resource URL to replace

  Returns the rewritten page. When no usable target path can be derived from
  `url` (resource-name yields nil, or the composed path is empty) the page is
  returned unchanged."
  [dir page url]
  ;; some->> short-circuits on a nil resource name, and compose-path itself
  ;; returns nil when both dir and name are empty; both cases fall through
  ;; to returning the page as-is. The original fallback was `(page)`, which
  ;; tried to invoke the string as a function and threw.
  (if-let [path (some->> (resource-name url) (compose-path dir))]
    (string/replace page url path)
    page))
(defn move-resources-to-dir
  "Applies move-url-to-dir to `page` once per entry of `urls`, feeding the
  result of each rewrite into the next, and returns the final page."
  [dir page urls]
  (reduce (fn [rewritten url]
            (move-url-to-dir dir rewritten url))
          page
          urls))
(defn move-resources-to-root
  "Same as move-resources-to-dir with an empty directory: each url in `urls`
  is rewritten inside `page` using an empty target directory."
  [page urls]
  (let [root-dir ""]
    (move-resources-to-dir root-dir page urls)))
(defn add-charset-meta
  "Returns `page` with a utf-8 charset meta tag prepended, unless the page
  already contains a `<meta charset=` declaration, in which case it is
  returned untouched."
  [page]
  (if (re-find #"<meta\s+charset=" page)
    page
    (str "<meta charset=utf-8>" page)))
(defn save-http-page
  "Writes `page` to the file at `path` after two transformations: image
  references found by page-imgs are rewritten with move-resources-to-root,
  then add-charset-meta is applied to the result."
  [page path]
  (->> (page-imgs page)
       (move-resources-to-root page)
       add-charset-meta
       (spit path)))
(defn save-pages
  "Fetches every URL in `pages` concurrently and saves each body with
  save-http-page under the name \"test-<part of the url after //>.html\".

  Fetching is blocking I/O, so each request runs on its own async/thread
  rather than inside a go block. Every worker always puts exactly one
  message on the channel: the body on success, or the caught Throwable on
  failure. This keeps the consumer from waiting forever on a result that
  will never arrive. Failed URLs are reported on stderr and skipped.

  Returns nil."
  [pages]
  (let [c (chan)]
    (doseq [p pages]
      (async/thread
        (>!! c [p (try
                    (page p)
                    (catch Throwable t t))])))
    ;; Exactly one message is expected per requested page.
    (dotimes [_ (count pages)]
      (let [[p result] (<!! c)]
        (if (instance? Throwable result)
          (binding [*out* *err*]
            (println "Failed to fetch" p "-" (.getMessage ^Throwable result)))
          (let [file-name (str "test-" (last (string/split p #"//")) ".html")]
            (save-http-page result file-name)))))))
;; Runs when the namespace is loaded: downloads and saves these pages.
(let [urls ["http://google.com"
            "http://dev.by"
            "http://amazon.com"]]
  (save-pages urls))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment