shriphani · January 25, 2016 08:00
diff --git a/crawl-sp-blog-xpath.clj b/crawl-sp-blog-xpath.clj
 ;;  (:require [clj-xpath.core :refer :all]
 ;;            [net.cgrand.enlive-html :as html]
 ;;            [org.bovinegenius.exploding-fish :as uri]
 ;;            [pegasus.core :refer [crawl]])

 (defn crawl-sp-blog-xpaths
   "Crawls blog.shriphani.com starting from its RSS feed, using XPath to
   pull article links out of each fetched document.

   Calls `crawl` with a custom :extractor. For a fetched object whose :url
   host is blog.shriphani.com, the extractor parses :body as XML, selects
   every //item/link node and stores the nodes' text under :extracted.
   For any other host the extractor returns nil. The crawl is configured
   with a corpus size of 20 and /tmp/sp-blog-corpus as the job directory."
   []
   (crawl {:seeds ["http://blog.shriphani.com/feeds/all.rss.xml"]
           :user-agent "Pegasus web crawler"
           :extractor
           (fn [obj]
             ;; ensure that we only extract in domain
             (when (= "blog.shriphani.com" (-> obj :url uri/host))
               ;; Best effort: a body that fails to parse, or an XPath
               ;; failure, yields no links instead of throwing out of
               ;; the extractor. Parsing and querying share one try so
               ;; the query is never run against a nil document.
               (let [links (try
                             ($x "//item/link" (xml->doc (:body obj)))
                             (catch Exception _ nil))]
                 ;; add extracted links to the supplied object;
                 ;; (map :text nil) is (), so :extracted is always a seq
                 (assoc obj :extracted (map :text links)))))
           :corpus-size 20 ;; crawl 20 documents
           :job-dir "/tmp/sp-blog-corpus"}))
	;; (:require [clj-xpath.core :refer :all]
	;; [net.cgrand.enlive-html :as html]
	;; [org.bovinegenius.exploding-fish :as uri]
	;; [pegasus.core :refer [crawl]])

	(defn crawl-sp-blog-xpaths
	  "Crawls blog.shriphani.com starting from its RSS feed, using XPath to
	  pull article links out of each fetched document.

	  Calls `crawl` with a custom :extractor. For a fetched object whose :url
	  host is blog.shriphani.com, the extractor parses :body as XML, selects
	  every //item/link node and stores the nodes' text under :extracted.
	  For any other host the extractor returns nil. The crawl is configured
	  with a corpus size of 20 and /tmp/sp-blog-corpus as the job directory."
	  []
	  (crawl {:seeds ["http://blog.shriphani.com/feeds/all.rss.xml"]
	          :user-agent "Pegasus web crawler"
	          :extractor
	          (fn [obj]
	            ;; ensure that we only extract in domain
	            (when (= "blog.shriphani.com" (-> obj :url uri/host))
	              ;; Best effort: a body that fails to parse, or an XPath
	              ;; failure, yields no links instead of throwing out of
	              ;; the extractor. Parsing and querying share one try so
	              ;; the query is never run against a nil document.
	              (let [links (try
	                            ($x "//item/link" (xml->doc (:body obj)))
	                            (catch Exception _ nil))]
	                ;; add extracted links to the supplied object;
	                ;; (map :text nil) is (), so :extracted is always a seq
	                (assoc obj :extracted (map :text links)))))
	          :corpus-size 20 ;; crawl 20 documents
	          :job-dir "/tmp/sp-blog-corpus"}))
No results found