es-scrape-web-listing
;; -*- lexical-binding: t -*-

(require 'cl-lib)
(defun es-scrape-web-listing
    (start-url collect-function next-url-function done-function
               &optional pages-limit silent)
  "Retrieve a list of things from a multi-page web document.
START-URL is the location from which to start scraping.
COLLECT-FUNCTION should return a list of collected things; it is
called with the retrieved page's buffer current and point at the
beginning of the buffer.
NEXT-URL-FUNCTION should return the URL of the next page, or nil
when on the last page.
DONE-FUNCTION is called once processing is finished, with one
argument, the list of results.
The number of retrieved pages can be limited by setting
PAGES-LIMIT to a number.
When SILENT is non-nil, no progress messages are displayed.
Returns a function that stops the scraping process."
  (cl-assert (and start-url collect-function next-url-function done-function))
  (let (retrieve-recursively
        collected
        next-url
        (visited-urls (list start-url)))
    (setq retrieve-recursively
          (lambda (&rest _args)
            ;; Called by `url-retrieve' with the response buffer current.
            (goto-char (point-min))
            (setq collected
                  (nconc collected
                         (funcall collect-function)))
            (goto-char (point-min))
            (unless silent
              ;; (car visited-urls) is the URL of the page just scraped.
              (message "Scraped \"%s\". Collected so far: %s"
                       (car visited-urls)
                       (length collected)))
            (cond ((and pages-limit (<= (cl-decf pages-limit) 0))
                   (funcall done-function collected))
                  ((and (setq next-url (funcall next-url-function))
                        (not (member next-url visited-urls)))
                   (push next-url visited-urls)
                   (url-retrieve next-url retrieve-recursively))
                  (t (funcall done-function collected)))
            (kill-buffer)))
    (url-retrieve start-url retrieve-recursively))
  ;; Under lexical binding this closure shares NEXT-URL-FUNCTION with
  ;; the callback above, so rebinding it to a function that returns nil
  ;; ends the scrape after the page currently in flight.
  (lambda ()
    (setq next-url-function (lambda () nil))))
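;; A minimal usage sketch (hypothetical URL and deliberately naive
;; callbacks, for illustration only): collect every link on each page,
;; follow a rel="next" link, and report the total when done.
;;
;; (es-scrape-web-listing
;;  "http://example.com/"
;;  (lambda ()
;;    (cl-loop while (re-search-forward "href=\"\\(.+?\\)\"" nil t)
;;             collect (match-string 1)))
;;  (lambda ()
;;    (when (re-search-forward "rel=\"next\" href=\"\\(.+?\\)\"" nil t)
;;      (match-string 1)))
;;  (lambda (results)
;;    (message "%s links collected" (length results)))
;;  3)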
;; Example: nczonline.net

(defvar ncz-posts nil)
(defvar ncz-scraper-stop nil)
(setq ncz-scraper-stop
      (es-scrape-web-listing
       "http://www.nczonline.net/"
       (lambda ()
         ;; Collect (URL . TITLE) pairs from each post snippet.
         (cl-loop with link-holder
                  with name-holder
                  while (and (search-forward "class=\"post-snippet" nil t)
                             (re-search-forward "href=\"\\(?1:.+?\\)\"" nil t)
                             (setq link-holder (match-string 1))
                             (re-search-forward ">\\(?1:.+?\\)<" nil t)
                             (setq name-holder (match-string 1)))
                  collecting (cons link-holder name-holder)))
       (lambda ()
         ;; Return the "next page" link, or nil on the last page.
         (ignore-errors
           (search-forward "<div class=\"navigation\">")
           (re-search-forward "<a href=\"\\(?1:.+?\\)\" >")
           (match-string 1)))
       (lambda (result)
         (setq ncz-posts result)
         (message "Done. %s items found." (length result)))))
(defun es-alist-to-org-links (alist)
  "Insert each (URL . TITLE) pair of ALIST as an org-mode checkbox link."
  (cl-dolist (pair alist)
    (insert "- [ ] [[" (car pair) "][" (cdr pair) "]]\n")))

;; To convert the retrieved list to org format:
;; (es-alist-to-org-links ncz-posts)
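;; For reference, a pair such as
;; ("http://example.com/post" . "Post title")   ; hypothetical data
;; would be inserted as:
;; - [ ] [[http://example.com/post][Post title]]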