Last active
February 26, 2018 20:43
-
-
Save masatoi/9adff3dd0b7735dea8d2e55c0ec0e011 to your computer and use it in GitHub Desktop.
State-machine based CSV reader
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| (defpackage :csv-reader | |
| (:use :cl) | |
| (:nicknames :csv) | |
| (:export #:parse-file #:parse-stream)) | |
| (in-package :csv) | |
| (declaim (inline extend-field cell-input cell-finish row-finish parse finish)) | |
| (eval-when (:compile-toplevel) | |
| (defparameter *optimize-settings* | |
| '(optimize (speed 3) (safety 0) (debug 0)) | |
| ;; '(optimize (speed 0) (safety 3) (debug 3)) | |
| )) | |
| (defmacro defn (function-spec (&rest arg-specs) &body body) | |
| (assert (listp function-spec)) | |
| (assert (listp arg-specs)) | |
| (dolist (arg-spec arg-specs) | |
| (assert (listp arg-spec)) | |
| (assert (= (length arg-spec) 2))) | |
| `(progn | |
| (declaim (ftype (function ,(mapcar #'cadr arg-specs) ,(cadr function-spec)) ,(car function-spec))) | |
| (defun ,(car function-spec) ,(mapcar #'car arg-specs) | |
| (declare ,*optimize-settings* | |
| ,@(mapcar (lambda (arg arg-type) | |
| (list 'type arg-type arg)) | |
| (mapcar #'car arg-specs) | |
| (mapcar #'cadr arg-specs))) | |
| ,@body))) | |
| (defmacro tlet (bindings &body body) | |
| (assert (listp bindings)) | |
| (dolist (binding bindings) | |
| (assert (listp binding)) | |
| (assert (= (length binding) 3))) | |
| `(let (,@(mapcar (lambda (binding) | |
| (subseq binding 0 2)) | |
| bindings)) | |
| (declare ,@(mapcar (lambda (binding) | |
| (list 'type (caddr binding) (car binding))) | |
| bindings)) | |
| ,@body)) | |
| (defstruct (state (:constructor %make-state)) | |
| (buffer "" :type (simple-array character)) | |
| (field "" :type (simple-array character)) | |
| (offset 0 :type fixnum) | |
| (space-count 0 :type fixnum) | |
| row | |
| row-ptr | |
| result | |
| result-ptr) | |
| (defun print-state (state) | |
| (format t "offset: ~A, space-count: ~A, row: ~A, row-ptr: ~A, result: ~A, result-ptr: ~A~%" | |
| (state-offset state) (state-space-count state) | |
| (state-row state) (state-row-ptr state) | |
| (state-result state) (state-result-ptr state))) | |
| (defun make-state (buffer-size field-size) | |
| (%make-state | |
| :buffer (make-string buffer-size :initial-element #\Null) | |
| :field (make-string field-size :initial-element #\Null))) | |
| (defn (cell-input null) ((state state) (char character)) | |
| (tlet ((field (state-field state) (simple-array character)) | |
| (offset (state-offset state) fixnum)) | |
| (when (>= offset (length field)) (error "too large field size")) | |
| (setf (char field offset) char) | |
| (incf (state-offset state)) | |
| nil)) | |
| (defn (cell-finish null) ((state state)) | |
| (tlet ((space-count (state-space-count state) fixnum) | |
| (offset (state-offset state) fixnum) | |
| (field (state-field state) (simple-array character))) | |
| (let* ((offset-without-space (if (> space-count 0) (- offset space-count) offset)) | |
| (cell (subseq field 0 offset-without-space))) | |
| (setf (state-offset state) 0 | |
| (state-space-count state) 0) | |
| (if (null (state-row state)) | |
| (setf (state-row state) (cons cell nil) | |
| (state-row-ptr state) (state-row state)) | |
| (setf (cdr (state-row-ptr state)) (cons cell nil) | |
| (state-row-ptr state) (cdr (state-row-ptr state)))) | |
| nil))) | |
| (defn (row-finish null) ((state state)) | |
| (tlet ((offset (state-offset state) fixnum)) | |
| (when (> offset 0) | |
| (cell-finish state)) | |
| (if (null (state-result state)) | |
| (setf (state-result state) (cons (state-row state) nil) | |
| (state-result-ptr state) (state-result state)) | |
| (setf (cdr (state-result-ptr state)) (cons (state-row state) nil) | |
| (state-result-ptr state) (cdr (state-result-ptr state)))) | |
| (setf (state-row state) nil) | |
| nil)) | |
| (defn (parse state) ((size fixnum) (state state) (separator character) (quote character)) | |
| (tlet ((state-tag :START keyword) | |
| (buffer (state-buffer state) (simple-array character))) | |
| (loop for i fixnum from 0 below size | |
| for c character across buffer do | |
| (tagbody | |
| ;; (format t "state-tag: ~A~%" state-tag) | |
| ;; (print-state state) | |
| (when (and (eq state-tag :QUOTE-ESCAPSE) (char/= c quote)) | |
| (setf state-tag :START)) | |
| (when (eq state-tag :START) | |
| (cond ((or (char= c #\Space) (char= c #\Tab)) | |
| (go :CONTINUE)) | |
| ((or (char= c #\Newline) (char= c #\Linefeed)) | |
| (row-finish state) | |
| (go :CONTINUE)) | |
| ((char= c separator) | |
| (cell-finish state) | |
| (go :CONTINUE)) | |
| ((char= c quote) | |
| (setf state-tag :QUOTE) | |
| (go :CONTINUE)) | |
| (t | |
| (setf state-tag :NORMAL)))) | |
| ;; processing field | |
| (case state-tag | |
| (:QUOTE | |
| (cond ((char= c quote) | |
| (cell-finish state) | |
| (setf state-tag :QUOTE-ESCAPSE) | |
| (go :CONTINUE)) | |
| (t (cell-input state c)))) | |
| (:NORMAL | |
| (cond ((char= c separator) | |
| (cell-finish state) | |
| (setf state-tag :START)) | |
| ((or (char= c #\Newline) (char= c #\Linefeed)) | |
| (row-finish state) | |
| (setf state-tag :START)) | |
| ((or (char= c #\Space) (char= c #\Tab) (char= c #\Return)) | |
| (cell-input state c) | |
| (incf (state-space-count state))) | |
| (t | |
| (cell-input state c) | |
| (setf (state-space-count state) 0)))) | |
| (:QUOTE-ESCAPSE | |
| (cond ((char= c quote) | |
| (cell-input state c) | |
| (setf state-tag :QUOTE))))) | |
| :CONTINUE)) | |
| state)) | |
| (defun finish (state) | |
| (when (> (state-offset state) 0) | |
| (row-finish state)) | |
| (state-result state)) | |
| (defun parse-stream (stream &key (separator #\,) (quote #\") (buffer-size 8192) (field-size 1024)) | |
| (declare (optimize (speed 3) (space 0) (debug 0) (safety 0))) | |
| (declare (type character separator quote)) | |
| (declare (type fixnum buffer-size field-size)) | |
| (let ((state (make-state buffer-size field-size))) | |
| ;; (declare (inline extend-field cell-input cell-finish row-finish parse finish)) | |
| (loop for size = (read-sequence (state-buffer state) stream) | |
| until (= size 0) | |
| do (parse size state separator quote) | |
| finally (return (finish state))))) | |
| ;; (defparameter kaggle-mnist1-csv | |
| ;; "label \"pixel0\" pixel1 | |
| ;; 1 0 0" | |
| ;; ) | |
| ;; (with-input-from-string (s kaggle-mnist1-csv) | |
| ;; (parse-stream s :separator #\ )) | |
| (defun parse-file (file &key (external-format #+clisp system::*default-file-encoding* | |
| #+allegro excl:*default-external-format* | |
| #+sbcl :utf-8) | |
| (separator #\,) (quote #\") | |
| (buffer-size 8192) (field-size 1024)) | |
| (if (null (probe-file file)) | |
| (error "can't open file") | |
| (with-open-file (stream file :direction :input :external-format external-format | |
| #+clisp :buffered #+clisp t | |
| #+allegro :mapped #+allegro t) | |
| (parse-stream stream :separator separator :quote quote | |
| :buffer-size buffer-size :field-size field-size)))) | |
| ;; https://github.com/sbussmann/kaggle-mnist | |
| ;; (time (defparameter kaggle-mnist1 (parse-file "/home/wiz/datasets/kaggle-mnist/Data/train.csv"))) | |
| ;; Evaluation took: | |
| ;; 1.776 seconds of real time | |
| ;; 1.776000 seconds of total run time (1.668000 user, 0.108000 system) | |
| ;; [ Run times consist of 0.824 seconds GC time, and 0.952 seconds non-GC time. ] | |
| ;; 100.00% CPU | |
| ;; 6,023,211,716 processor cycles | |
| ;; 1,583,386,544 bytes consed | |
| ;; (time (defparameter kaggle-mnist2 (fare-csv:read-csv-file "/home/wiz/datasets/kaggle-mnist/Data/test.csv"))) | |
| ;; Evaluation took: | |
| ;; 8.921 seconds of real time | |
| ;; 8.892000 seconds of total run time (8.728000 user, 0.164000 system) | |
| ;; [ Run times consist of 1.400 seconds GC time, and 7.492 seconds non-GC time. ] | |
| ;; 99.67% CPU | |
| ;; 30,261,429,557 processor cycles | |
| ;; 2,945,108,736 bytes consed |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment