Last active
August 29, 2015 14:11
-
-
Save lsgrep/cb7d0dd54f5d5f22b5bb to your computer and use it in GitHub Desktop.
Google Scholar crawler
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;; Scraping Google Scholar: URL constants.

(def ^:dynamic scholar-base
  "Prefix of a Google Scholar search URL; the encoded query text is appended
  directly after `q=`. Declared dynamic, so it can be rebound with `binding`."
  "http://scholar.google.com/scholar?q=")

(def start-url
  "Example search URL without a `start` parameter."
  "http://scholar.google.com/scholar?q=machine++learning")

(def typical-url
  "Example search URL that carries a `start=10` result offset."
  "http://scholar.google.com/scholar?q=machine++learning&start=10")
(defn rebuild-keywords
  "Turns a free-text search phrase into a value that is safe to place after
  `q=` in a query string.

  Surrounding whitespace is trimmed and the rest is form-encoded as UTF-8:
  each space becomes `+` (as before), and characters that would otherwise
  break or alter the query (`&`, `#`, `+`, `%`, non-ASCII, ...) are
  percent-encoded.

  keyword - the search phrase, a String.
  Returns the encoded String."
  [keyword]
  ;; The ^String hint avoids a reflective call to .trim and lets the compiler
  ;; resolve the URLEncoder/encode overload statically.
  (java.net.URLEncoder/encode (.trim ^String keyword) "UTF-8"))
(defn next-page-url
  "Returns `url` advanced by one results page (10 results).

  If `url` already has a numeric `start` query parameter, only that
  parameter's value is increased by 10; the rest of the URL is untouched.
  Otherwise `&start=10` is appended.

  url - a search URL String that already contains a query string.
  Returns the URL String for the following page."
  [url]
  ;; Match the `start=` parameter itself. Looking for the bare substring
  ;; "start", or replacing the offset digits wherever they occur, would
  ;; misfire on queries such as q=startup or q=top+10.
  (if (re-find #"[?&]start=\d+" url)
    (str/replace-first url
                       #"([?&]start=)(\d+)"
                       (fn [[_ prefix offset]]
                         (str prefix (+ 10 (Long/parseLong offset)))))
    (str url "&start=10")))
;; Scratch checks for next-page-url: the page after start-url, then the
;; first five URLs of the paging sequence.
(next-page-url start-url)
(->> start-url
     (iterate next-page-url)
     (take 5))

;; Sample search phrase used by the scratch calls further down.
(def test-keyword "machine learning")
;; Builds the search URL for a keyword phrase and a results page.
(defn build-query-url
  "Returns the search URL for `keyword`, optionally for a given results page.

  keyword - the search phrase; it is passed through `rebuild-keywords`.
  page    - 1-based page number, defaulting to 1. Values below 1 are treated
            as the first page.

  Page 1 has no `start` parameter; page n (n > 1) uses `start=(n-1)*10`, so
  page 2 is `start=10`, matching `next-page-url`."
  ([keyword]
   (build-query-url keyword 1))
  ([keyword page]
   (let [base (str scholar-base (rebuild-keywords keyword))]
     (if (<= page 1)
       base
       ;; Each page holds 10 results, so page n starts at offset (n-1)*10.
       ;; Using n*10 would skip results 10-19 when asking for page 2.
       (str base "&start=" (* 10 (dec page)))))))
;; Scratch checks: the first-page URL for the sample phrase, and the encoded
;; phrase on its own.
(-> test-keyword build-query-url)
(-> test-keyword rebuild-keywords)
;; Fetch a results page and pick out its result nodes.
(defn get-page
  "Requests `query-url` with `http/get`, parses the response `:body` with
  `html/html-snippet`, passes the nodes through `html/html-resource`, and
  returns the nodes that match the selector `[:div.gs_r]`.

  NOTE(review): `http` and `html` are namespace aliases declared outside this
  section - presumably an HTTP client and Enlive; confirm against the ns form."
  [query-url]
  (-> (http/get query-url)
      :body
      html/html-snippet
      html/html-resource
      (html/select [:div.gs_r])))
;; Sample data for trying get-page: the result nodes of the first page for
;; test-keyword. The init expression runs when this form is evaluated, so
;; loading the file calls get-page (and therefore http/get) once.
(def sample-page-res
  (-> test-keyword
      build-query-url
      get-page))
(defn select-title
  "Returns the text of the first node in `res` that matches the selector
  `[:h3.gs_rt :a]`."
  [res]
  (-> res
      (html/select [:h3.gs_rt :a])
      first
      html/text))
(defn select-pdf-url
  "Returns the `:href` attribute of the first node in `res` that matches the
  selector `[:div.gs_ttss :a]`, or nil when no node matches or it has no href."
  [res]
  (-> res
      (html/select [:div.gs_ttss :a])
      first
      :attrs
      :href))
(defn get-data-from-page
  "Returns a lazy sequence with one map per element of `page`, each holding
  `:title` (from `select-title`) and `:pdf` (from `select-pdf-url`)."
  [page]
  (for [res page]
    {:title (select-title res)
     :pdf   (select-pdf-url res)}))
;; REPL scratch: fetch and parse page 2 for the sample phrase. It sits inside
;; `comment` so that loading the file does not fire an HTTP request whose
;; result is then discarded; evaluate the inner form by hand when needed.
(comment
  (get-data-from-page (get-page (build-query-url test-keyword 2))))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment