Last active
November 8, 2017 04:47
-
-
Save html/5283629 to your computer and use it in GitHub Desktop.
Utilities I use for parsing, it depends on "php funcs for cl" https://gist.github.com/html/4707958
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(defvar *cache-dir* '(:relative "cache")) | |
(defvar *cache-enabled-p* nil) | |
(defvar *drakma-request-max-tries* nil) ; | |
(defvar *character-for-wrong-utf-8-chars* #\@) | |
; V5 | |
(defun drakma-request (&rest args) | |
"Gets url contents using drakma:http-request. | |
Tries several times to do it on error. | |
When *drakma-request-max-tries* is NIL tries forever | |
Otherwise tries *drakma-request-max-tries* times | |
Replaces wrong utf-8 characters with *character-for-wrong-utf-8-chars* upon encoding error | |
" | |
; posible errors 'drakma::drakma-simple-error | |
; usocket:timeout-error | |
(let ((result) | |
(attempt 0)) | |
(loop | |
while (or (not *drakma-request-max-tries*) | |
(< attempt *drakma-request-max-tries*)) | |
do | |
(incf attempt) | |
(setf result (ignore-errors | |
(multiple-value-list | |
(handler-bind ((flexi-streams:external-format-encoding-error | |
#'(lambda (c) | |
(use-value *character-for-wrong-utf-8-chars*)))) | |
(apply #'drakma:http-request args))))) | |
(if result | |
(return-from drakma-request (apply #'values result)) | |
(progn | |
(format t "Failed to get result from ~A, trying again~%" (car args)) | |
(sleep 3)))))) | |
; V3 | |
(defun get-content-from-url(&rest args) | |
"Gets content from url, caches it in *cache-dir* directory if *cache-enabled-p* is true. Parameters are similar to drakma-request" | |
(when (not *cache-enabled-p*) | |
(return-from get-content-from-url (apply #'drakma-request args))) | |
(let* ((url (car args)) | |
(cached-filename (make-pathname | |
:directory *cache-dir* | |
:name (cl-ppcre:regex-replace-all "^[a-zA-Z0-9]" url "-"))) | |
(content)) | |
(if (probe-file cached-filename) | |
(file-get-contents cached-filename) | |
(progn | |
(setf content (apply #'drakma-request args)) | |
(when content (file-put-contents cached-filename content)) | |
content)))) | |
(defun pretty-print-dom-recursive (object &optional (stream t) (indent-level 0)) | |
(unless (zerop indent-level) | |
(format stream "~%")) | |
(dotimes (i indent-level) | |
(format stream " ")) | |
(cond | |
((consp object) | |
(loop for i in object do | |
(pretty-print-dom-recursive i stream (1+ indent-level)))) | |
((subtypep (type-of object) 'array ) | |
(loop for i across object do | |
(pretty-print-dom-recursive i stream (1+ indent-level)))) | |
((equal (type-of object) 'rune-dom::text) | |
(format stream "~A" (dom:node-value object))) | |
(t (progn | |
(print-unreadable-object (object stream) | |
(princ (dom:tag-name object) stream) | |
(dom:do-node-map (i (dom:attributes object)) | |
(format stream " ~A=\"~A\"" (dom:name i) (dom:value i))) | |
(dom:do-node-list (i (dom:child-nodes object)) | |
(pretty-print-dom-recursive i stream (1+ indent-level)))))))) | |
(defun pretty-print-dom-recursive-to-string (object) | |
(with-output-to-string (s) | |
(pretty-print-dom-recursive object s))) | |
(defun get-node-text (dom) | |
"Recursive function for getting node text" | |
(if (dom:text-node-p dom) | |
(dom:node-value dom) | |
(join | |
(loop for i across (dom:child-nodes dom) collect (get-node-text i))))) | |
(defun get-nodes-text (list) | |
(join (mapcar #'get-node-text list))) | |
(defun replace-utf-8-sequences(string) | |
(cl-ppcre:regex-replace-all | |
"\\\\u[A-Fa-f0-9]{4}" | |
string | |
(lambda (target-string start end match-start match-end &rest args) | |
(string (code-char | |
(parse-integer | |
(subseq target-string (+ match-start 2) match-end) | |
:radix 16)))))) | |
(defun pprint-json (json &optional stream &key (indent-expr #\Tab)) | |
(let ((level 0) | |
(prev-char ) | |
(in-quotes-p) | |
(ends-line-level) | |
(ret)) | |
(setf ret (with-output-to-string (result) | |
(loop for char across json do | |
(let ((new-line-level) | |
(post "")) | |
(when ends-line-level | |
(setf new-line-level ends-line-level) | |
(setf ends-line-level nil)) | |
(cond | |
((and | |
(char= char #\") | |
(or (not prev-char) | |
(not (char= prev-char #\\)))) | |
(setf in-quotes-p (not in-quotes-p))) | |
((not in-quotes-p) | |
(case char | |
((#\} #\]) | |
(decf level) | |
(setf ends-line-level nil) | |
(setf new-line-level level)) | |
((#\{ #\[) | |
(incf level) | |
(setf ends-line-level level)) | |
(#\, (setf ends-line-level level)) | |
(#\: (setf post " ")) | |
((#\Space #\Tab #\Newline #\Return) | |
(setf char nil) | |
(setf ends-line-level new-line-level) | |
(setf new-line-level nil))))) | |
(when new-line-level | |
(format result "~%") | |
(dotimes (i new-line-level) | |
(format result "~a" indent-expr))) | |
(format result "~A~A" char post) | |
(setf prev-char char))))) | |
(if stream | |
(write-string ret stream) | |
ret))) | |
(defun string-trim-spaces (string) | |
; !!IMPORTANT, below are two different spaces | |
(string-trim " " string)) | |
(defun dom-element-empty-p (dom) | |
(zerop (length (string-trim-spaces (get-node-text dom))))) | |
; v2 | |
(defun normalize-string-spaces (str) | |
(ppcre:regex-replace-all "[\\s ]+" str " ")) | |
(defun remove-newlines (string) | |
(ppcre:regex-replace-all "\\n" string "")) | |
(defun transform-to-cells-without-spans (trs) | |
(let* ((cells-length | |
(loop for i across (dom:child-nodes (first trs)) | |
sum (if (dom:has-attribute i "colspan") | |
(parse-integer (dom:get-attribute i "colspan")) | |
1))) | |
(cells-vertical-length (length trs)) | |
(result (make-array (list cells-vertical-length cells-length ) | |
:element-type 'dom:node))) | |
(let ((x 0) | |
(y 0) | |
(row) | |
(cell)) | |
(flet ((maybe-shift-x () | |
(loop while (and (not (= x cells-length)) | |
(not (numberp (aref result y x)))) | |
do | |
(incf x)))) | |
(loop for row in trs | |
do | |
(loop for cell across (dom:child-nodes row) do | |
;(format t "~A ~A ~A~%" x y cell) | |
(unless cell | |
(return)) | |
(cond | |
((and | |
(dom:has-attribute cell "colspan") | |
(dom:has-attribute cell "rowspan")) | |
;(format t "Has rowspan and colspan ~A ~A~%" x y) | |
(loop for i from 1 | |
to (parse-integer (dom:get-attribute cell "colspan")) do | |
(loop for j from 0 | |
to (1- (parse-integer (dom:get-attribute cell "rowspan"))) do | |
(setf (aref result (+ y j) x) cell)) | |
(incf x))) | |
((dom:has-attribute cell "colspan") | |
;(format t "Has colspan ~A ~A~%" x y) | |
(loop for i from 1 to (parse-integer (dom:get-attribute cell "colspan")) do | |
(maybe-shift-x) | |
(or | |
(ignore-errors | |
(setf (aref result y x) cell) | |
(incf x)) | |
(progn | |
(warn "Step out of bounds on rowspan with x=~A y=~A" x y) | |
(return))))) | |
((dom:has-attribute cell "rowspan") | |
(progn | |
;(format t "Has rowspan ~A ~A~%" x y) | |
(maybe-shift-x) | |
(loop for i from 0 to (1- (parse-integer (dom:get-attribute cell "rowspan"))) do | |
(or | |
(ignore-errors (setf (aref result (+ y i) x) cell)) | |
(progn | |
(warn "Step out of bounds on rowspan with x=~A y=~A" x y) | |
(return)))) | |
(incf x))) | |
(t | |
(progn | |
(maybe-shift-x) | |
(or | |
(ignore-errors | |
(setf (aref result y x) cell) | |
(incf x)) | |
(warn "Step out of bounds x=~A y=~A" x y)))))) | |
(incf y) | |
(setf x 0)))) | |
result)) | |
(defun matches-count (re str) | |
(length (ppcre:all-matches-as-strings re str))) | |
(defun render-html-to-string (document) | |
(with-output-to-string (out) | |
(dom:map-document | |
(cxml:make-character-stream-sink | |
out | |
:indentation 2 | |
:canonical nil | |
:omit-xml-declaration-p t) | |
document))) | |
; V2 | |
(defun nodes-list-to-document (nodes) | |
"Converts dom elements to separate document" | |
(let ((document (rune-dom:create-document (first nodes)))) | |
(loop for i in (cdr nodes) do | |
(dom:append-child document (dom:import-node document i t))) | |
document)) | |
; | |
(defun dom-list-to-document (dom-list) | |
(error "Use nodes-list-to-document instead")) | |
(defun write-html (file document) | |
(with-open-file (out file :direction :output :if-does-not-exist :create :if-exists :supersede) | |
(dom:map-document | |
(cxml:make-character-stream-sink | |
out | |
:indentation 2 | |
:canonical nil | |
:omit-xml-declaration-p t) | |
document))) | |
(defun json-single-to-double-quotes (string) | |
(ppcre:regex-replace-all | |
"\\\\'" | |
(ppcre:regex-replace-all | |
"([^\\\\])'" | |
(ppcre:regex-replace-all "\"" string "\\\"") | |
"\\1\"") | |
"'")) | |
(defun parse-string(string) | |
(chtml:parse string (cxml-dom:make-dom-builder))) | |
(defun parse-xml-string (string) | |
(cxml:parse string (cxml-dom:make-dom-builder))) | |
; V2 | |
(defun parse-url(&rest args) | |
"Parses url contents to cxml-dom object. Parameters are similar to drakma:http-request. Uses get-content-from-url instead" | |
(chtml:parse | |
(apply #'get-content-from-url args) | |
(cxml-dom:make-dom-builder))) | |
; V1 | |
(defun parse-url-using-curl (url) | |
(chtml:parse | |
(with-output-to-string (s) | |
(external-program:run "curl" (list url) :output s :wait t)) | |
(cxml-dom:make-dom-builder))) | |
(defun parse-xml-url (&rest args) | |
(cxml:parse | |
(apply #'get-content-from-url args) | |
(cxml-dom:make-dom-builder))) | |
(defun id-equals-selector (id) | |
(lambda (item) | |
(and | |
(dom:has-attribute item "id") | |
(string= (dom:get-attribute item "id") id)))) | |
(defun remove-first-col (trs) | |
(loop for i in trs do | |
(let ((tds (loop for j across (dom:child-nodes i) collect j))) | |
(dom:remove-child | |
(dom:parent-node (first tds)) | |
(first tds)))) | |
trs) | |
(defun remove-last-col (trs) | |
(loop for i in trs do | |
(let ((tds (loop for j across (dom:child-nodes i) collect j))) | |
(dom:remove-child | |
(dom:parent-node (car (last tds))) | |
(car (last tds))))) | |
trs) | |
; V1 | |
(defun string-empty-p (str) | |
(or (not str) | |
(zerop | |
(length | |
(string-trim (list #\Newline #\Space #\Return) str))))) | |
; V2 | |
; Todo: support for ./some-link with page uri parameter | |
(defun get-links-from-html (html &key page-uri) | |
"Extracts links from page. | |
Extracts only links from a elements with href attribute set. | |
Requires html base tag. | |
Skips malformed urls and prints message to standard OUTPUT | |
Uses quri" | |
(let ((base-uri) | |
(doc (parse-string html)) | |
(uri-temp-var)) | |
; Setting base uri | |
(progn | |
(setf base-uri (css:query "base" doc)) | |
(when base-uri | |
(assert (= (length base-uri) 1)) | |
(setf base-uri (quri:uri (dom:get-attribute (car base-uri) "href"))))) | |
(remove-if | |
#'null | |
(loop for i in (css:query "a" doc) | |
if (dom:has-attribute i "href") | |
collect (handler-case | |
(quri:merge-uris (quri:uri (dom:get-attribute i "href")) base-uri) | |
(quri:uri-malformed-string (err) | |
(progn | |
(format t "Skipping wrong url ~A~%" (dom:get-attribute i "href")) | |
nil))))))) | |
; Todo: support for ./some-link with page uri parameter | |
(defun get-links-from-html-2 (html &key page-uri) | |
"Extracts links from page. | |
Extracts only links from a elements with href attribute set. | |
Requires html base tag. | |
Skips malformed urls and prints message to standard OUTPUT | |
Uses puri package for uris" | |
(let ((base-uri) | |
(doc (parse-string html)) | |
(uri-temp-var)) | |
; Setting base uri | |
(progn | |
(setf base-uri (css:query "base" doc)) | |
(when base-uri | |
(assert (= (length base-uri) 1)) | |
(setf base-uri (puri:uri (dom:get-attribute (car base-uri) "href"))))) | |
(remove-if | |
#'null | |
(loop for i in (css:query "a" doc) | |
if (dom:has-attribute i "href") | |
collect (handler-case | |
(puri:merge-uris (puri:uri (dom:get-attribute i "href")) base-uri) | |
(puri:uri-parse-error () | |
(progn | |
(format t "Skipping wrong url ~A~%" (dom:get-attribute i "href")) | |
nil))))))) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment