Skip to content

Instantly share code, notes, and snippets.

@lsgrep
Last active August 29, 2015 14:11
Show Gist options
  • Select an option

  • Save lsgrep/cb7d0dd54f5d5f22b5bb to your computer and use it in GitHub Desktop.

Select an option

Save lsgrep/cb7d0dd54f5d5f22b5bb to your computer and use it in GitHub Desktop.
Google scholar crawler
;; Scraping Google Scholar search results.
(def ^:dynamic scholar-base
  "Prefix of every Google Scholar search URL; the encoded query is appended to it."
  "http://scholar.google.com/scholar?q=")

(def start-url
  "Sample first-page search URL, used for experimenting at the REPL."
  (str scholar-base "machine++learning"))

(def typical-url
  "Sample search URL that already carries a `start` result offset."
  (str start-url "&start=10"))
(defn rebuild-keywords
  "Turns a free-text search phrase into a value that is safe to place in a
  URL query string.

  Leading and trailing whitespace is trimmed, spaces become `+`, and every
  other reserved or non-ASCII character is percent-encoded as UTF-8, so that
  characters such as `&`, `#` or `+` inside the phrase cannot break or
  truncate the query.

  keyword - the search phrase (a String).
  Returns the encoded String."
  [keyword]
  (java.net.URLEncoder/encode (.trim ^String keyword) "UTF-8"))
(defn next-page-url
  "Returns the URL of the results page that follows `url`.

  If `url` already has a `start=<n>` query parameter, that offset is advanced
  by 10; otherwise `&start=10` is appended.

  Only a real `start` parameter (one preceded by `?` or `&`) is recognised, so
  a search phrase that merely contains the text \"start\" is not mistaken for
  an offset. Only the parameter's own value is rewritten, so the same digits
  appearing elsewhere in the URL are left untouched, and `start` does not have
  to be the last parameter.

  url - a search URL (a String).
  Returns the next page's URL as a String."
  [url]
  (let [start-param #"([?&]start=)(\d+)"]
    (if-let [[_ prefix offset] (re-find start-param url)]
      ;; The replacement text is built from `?`/`&`, \"start=\" and digits only,
      ;; so it contains no characters that Matcher treats specially.
      (.replaceFirst (re-matcher start-param url)
                     (str prefix (+ 10 (Long/parseLong offset))))
      (str url "&start=10"))))
;; REPL checks: advance the sample URL once, then preview the first five page URLs.
(-> start-url next-page-url)
(->> start-url
     (iterate next-page-url)
     (take 5))

;; Sample search phrase used by the REPL experiments further down.
(def test-keyword "machine learning")
;; Builds the search URL for a keyword and an optional page number.
(defn build-query-url
  "Builds the Google Scholar search URL for `keyword`.

  keyword - the search phrase; it is passed through `rebuild-keywords`.
  page    - 1-based page number (defaults to 1).

  Page 1 is the bare query URL. Page n (n > 1) carries `&start=` with an
  offset of (n - 1) * 10, so page 2 is `start=10`; this is the same stepping
  that `next-page-url` produces. A page number below 1 is treated as page 1.

  Returns the URL as a String."
  ([keyword]
   (build-query-url keyword 1))
  ([keyword page]
   (let [first-page-url (str scholar-base (rebuild-keywords keyword))]
     (if (<= page 1)
       first-page-url
       ;; Offsets are zero-based: the first page is offset 0, the second is 10.
       (str first-page-url "&start=" (* 10 (dec page)))))))
;; REPL checks: the first-page URL for the sample phrase, and its encoded form.
(-> test-keyword build-query-url)
(-> test-keyword rebuild-keywords)
;; Fetch a results page and pull out its result blocks.
;; NOTE(review): `http` and `html` are namespace aliases declared outside this
;; snippet - presumably clj-http and Enlive; confirm against the ns form.
(defn get-page
  "Requests `query-url`, parses the response body as HTML and returns the
  nodes that match the `div.gs_r` selector."
  [query-url]
  (-> (http/get query-url)
      :body
      html/html-snippet
      html/html-resource
      (html/select [:div.gs_r])))
;; Try get-page out: result blocks of the first page for the sample phrase.
;; Evaluating this def calls get-page, which issues the HTTP request.
(def sample-page-res
  (-> test-keyword
      build-query-url
      get-page))
(defn select-title
  "Returns the text of the first node in `res` that matches the
  `h3.gs_rt a` selector."
  [res]
  (-> res
      (html/select [:h3.gs_rt :a])
      first
      html/text))
(defn select-pdf-url
  "Returns the `href` attribute of the first node in `res` that matches the
  `div.gs_ttss a` selector, or nil when nothing matches."
  [res]
  (let [link (first (html/select res [:div.gs_ttss :a]))]
    (-> link :attrs :href)))
(defn get-data-from-page
  "Maps every result block in `page` to a map holding its `:title`
  (via select-title) and its `:pdf` link (via select-pdf-url).
  Returns a lazy sequence with one map per block."
  [page]
  (for [entry page]
    {:title (select-title entry)
     :pdf   (select-pdf-url entry)}))
;; REPL check: fetch page 2 for the sample phrase and extract title/pdf pairs.
(-> test-keyword
    (build-query-url 2)
    get-page
    get-data-from-page)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment