Created
November 18, 2010 15:16
-
-
Save youz/705102 to your computer and use it in GitHub Desktop.
htmlからDOMツリーっぽいリストを作成 (要 www-mode) #xyzzy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;;; -*- mode:lisp; package:dom -*- | |
(eval-when (:compile-toplevel :load-toplevel :execute) | |
(require "xml-http-request") | |
(require "www/www")) | |
(defpackage dom | |
(:use :lisp :editor)) | |
(in-package "dom") | |
(export '(get-doc-from-url | |
get-doc-from-buffer | |
get-doc-from-file | |
tag attr contents | |
dumptree | |
find-node | |
find-node-by-tag | |
find-node-by-class) | |
) | |
(defvar *single-tags* '(:meta :link :img :input :br :hr :param :embed)) | |
(defmacro accum (accfn &body body) | |
`(let (#1=#:gacc) | |
(flet ((,accfn (#2=#:ga) (push #2# #1#))) | |
,@body) | |
(nreverse #1#))) | |
(defmacro whenlet (var test &body body) | |
`(let ((,var ,test)) (when ,var ,@body))) | |
(defun make-tree (parsed) | |
(labels | |
((kw (sym) (intern (string-downcase (symbol-name sym)) "keyword")) | |
(singletagp (elm) (or (find (kw (car elm)) *single-tags*) | |
(string-match "/$" (symbol-name (car elm))))) | |
(rec (tag) | |
(list* (kw (car tag)) | |
(mapcar #1=#'(lambda (attr) (cons (kw (car attr)) (cdr attr))) | |
(cadr tag)) | |
(accum a | |
(loop for elm = (pop parsed) do | |
(cond ((null elm) | |
(a `(:unmatched-open-tag ((:tag ,(kw (car tag)))))) | |
;(warn "unmatched open tag: <~A>~%" (car tag)) | |
(loop-finish)) | |
((stringp elm) | |
(when (string-match "[^ \t\n]" elm) (a elm))) | |
((consp elm) | |
(if (cadr (assoc 'close (cadr elm))) | |
(if (eq (car tag) (car elm)) | |
(loop-finish) | |
(a `(:unmatched-close-tag ((:tag ,(kw (car elm))))))) | |
;(warn "unmatched close tag: </~A>~%" (car elm))) | |
(a (if (singletagp elm) | |
`(,(kw (car elm)) ,(mapcar #1# (cadr elm))) | |
(rec elm))))) | |
(t (warn "~S~%" elm)))))))) | |
(when (setf parsed (member 'www::html parsed :key #'safe-car)) | |
(rec (pop parsed))))) | |
(defun parse-html-string (html) | |
(save-excursion | |
(let ((buf (create-new-buffer "*temp*"))) | |
(set-buffer buf) | |
(insert html) | |
(www::www-delete-comment) | |
(make-local-variable 'www::www-charset) | |
(setq www::www-charset (www::www-get-encode html)) | |
(prog1 | |
(make-tree (nreverse (www::www-parse-html))) | |
(delete-buffer buf))))) | |
(defun get-doc-from-url (url) | |
(let ((html (xhr:xhr-get url :key #'xhr:xhr-response-text :since :epoch))) | |
(when html | |
(parse-html-string html)))) | |
(defun get-doc-from-buffer (buf) | |
(save-excursion | |
(set-buffer buf) | |
(parse-html-string (buffer-substring (point-min) (point-max))))) | |
(defun get-doc-from-file (path) | |
(save-excursion | |
(let ((buf (create-new-buffer "*html*"))) | |
(set-buffer buf) | |
(setq need-not-save t) | |
(insert-file path) | |
(prog1 (get-doc-from-buffer buf) | |
(kill-buffer buf))))) | |
(defun dumptree (tree &optional (stream t) (indent 0)) | |
(format t "~&~VT" (* indent 2)) | |
(cond ((atom tree) (format stream "~S" tree)) | |
((listp tree) | |
(format stream "(~A (~{(~{~A ~S~})~^ ~})" (car tree) (cadr tree)) | |
(if (cddr tree) | |
(progn | |
(format stream "~%") | |
(dolist (child (cddr tree)) | |
(dumptree child stream (1+ indent))))) | |
(format stream ")")) | |
(t (format stream "~S" tree)))) | |
(defmacro tag (obj) | |
`(car ,obj)) | |
(defmacro attr (obj key) | |
`(cadr (assoc ,key (cadr ,obj)))) | |
(defmacro contents (obj) | |
`(cddr ,obj)) | |
(defun tagp (obj) | |
(and (listp obj) (keywordp (tag obj)))) | |
(defun find-node (tree pred &key (recursive t)) | |
(accum a | |
(labels | |
((rec (obj) | |
(cond ((tagp obj) | |
(when (funcall pred obj) | |
(a obj)) | |
(whenlet elms (and recursive (contents obj)) | |
(mapc #'rec elms))) | |
((listp obj) (mapc #'rec obj)) | |
((atom obj) nil)))) | |
(rec tree)))) | |
(defun find-node-by-tag (tree tag) | |
(find-node tree | |
(lambda (obj) (eql (car obj) tag)))) | |
(defun find-node-by-class (tree pattern) | |
(find-node | |
tree | |
(lambda (obj) | |
(whenlet class (attr obj :class) | |
(string-match pattern class))))) | |
(provide "dom") | |
; (let ((moss-menu (get-html "http://www.mos.co.jp/menu/all/"))) | |
; (find-node-by-class moss-menu "^heightLine-1")) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment