Skip to content

Instantly share code, notes, and snippets.

@masatoi
Last active February 26, 2018 20:43
Show Gist options
  • Select an option

  • Save masatoi/9adff3dd0b7735dea8d2e55c0ec0e011 to your computer and use it in GitHub Desktop.

Select an option

Save masatoi/9adff3dd0b7735dea8d2e55c0ec0e011 to your computer and use it in GitHub Desktop.
State-machine based CSV reader
(defpackage :csv-reader
(:use :cl)
(:nicknames :csv)
(:export #:parse-file #:parse-stream))
(in-package :csv)
(declaim (inline extend-field cell-input cell-finish row-finish parse finish))
(eval-when (:compile-toplevel)
(defparameter *optimize-settings*
'(optimize (speed 3) (safety 0) (debug 0))
;; '(optimize (speed 0) (safety 3) (debug 3))
))
(defmacro defn (function-spec (&rest arg-specs) &body body)
(assert (listp function-spec))
(assert (listp arg-specs))
(dolist (arg-spec arg-specs)
(assert (listp arg-spec))
(assert (= (length arg-spec) 2)))
`(progn
(declaim (ftype (function ,(mapcar #'cadr arg-specs) ,(cadr function-spec)) ,(car function-spec)))
(defun ,(car function-spec) ,(mapcar #'car arg-specs)
(declare ,*optimize-settings*
,@(mapcar (lambda (arg arg-type)
(list 'type arg-type arg))
(mapcar #'car arg-specs)
(mapcar #'cadr arg-specs)))
,@body)))
(defmacro tlet (bindings &body body)
(assert (listp bindings))
(dolist (binding bindings)
(assert (listp binding))
(assert (= (length binding) 3)))
`(let (,@(mapcar (lambda (binding)
(subseq binding 0 2))
bindings))
(declare ,@(mapcar (lambda (binding)
(list 'type (caddr binding) (car binding)))
bindings))
,@body))
(defstruct (state (:constructor %make-state))
(buffer "" :type (simple-array character))
(field "" :type (simple-array character))
(offset 0 :type fixnum)
(space-count 0 :type fixnum)
row
row-ptr
result
result-ptr)
(defun print-state (state)
(format t "offset: ~A, space-count: ~A, row: ~A, row-ptr: ~A, result: ~A, result-ptr: ~A~%"
(state-offset state) (state-space-count state)
(state-row state) (state-row-ptr state)
(state-result state) (state-result-ptr state)))
(defun make-state (buffer-size field-size)
(%make-state
:buffer (make-string buffer-size :initial-element #\Null)
:field (make-string field-size :initial-element #\Null)))
(defn (cell-input null) ((state state) (char character))
(tlet ((field (state-field state) (simple-array character))
(offset (state-offset state) fixnum))
(when (>= offset (length field)) (error "too large field size"))
(setf (char field offset) char)
(incf (state-offset state))
nil))
(defn (cell-finish null) ((state state))
(tlet ((space-count (state-space-count state) fixnum)
(offset (state-offset state) fixnum)
(field (state-field state) (simple-array character)))
(let* ((offset-without-space (if (> space-count 0) (- offset space-count) offset))
(cell (subseq field 0 offset-without-space)))
(setf (state-offset state) 0
(state-space-count state) 0)
(if (null (state-row state))
(setf (state-row state) (cons cell nil)
(state-row-ptr state) (state-row state))
(setf (cdr (state-row-ptr state)) (cons cell nil)
(state-row-ptr state) (cdr (state-row-ptr state))))
nil)))
(defn (row-finish null) ((state state))
(tlet ((offset (state-offset state) fixnum))
(when (> offset 0)
(cell-finish state))
(if (null (state-result state))
(setf (state-result state) (cons (state-row state) nil)
(state-result-ptr state) (state-result state))
(setf (cdr (state-result-ptr state)) (cons (state-row state) nil)
(state-result-ptr state) (cdr (state-result-ptr state))))
(setf (state-row state) nil)
nil))
(defn (parse state) ((size fixnum) (state state) (separator character) (quote character))
(tlet ((state-tag :START keyword)
(buffer (state-buffer state) (simple-array character)))
(loop for i fixnum from 0 below size
for c character across buffer do
(tagbody
;; (format t "state-tag: ~A~%" state-tag)
;; (print-state state)
(when (and (eq state-tag :QUOTE-ESCAPSE) (char/= c quote))
(setf state-tag :START))
(when (eq state-tag :START)
(cond ((or (char= c #\Space) (char= c #\Tab))
(go :CONTINUE))
((or (char= c #\Newline) (char= c #\Linefeed))
(row-finish state)
(go :CONTINUE))
((char= c separator)
(cell-finish state)
(go :CONTINUE))
((char= c quote)
(setf state-tag :QUOTE)
(go :CONTINUE))
(t
(setf state-tag :NORMAL))))
;; processing field
(case state-tag
(:QUOTE
(cond ((char= c quote)
(cell-finish state)
(setf state-tag :QUOTE-ESCAPSE)
(go :CONTINUE))
(t (cell-input state c))))
(:NORMAL
(cond ((char= c separator)
(cell-finish state)
(setf state-tag :START))
((or (char= c #\Newline) (char= c #\Linefeed))
(row-finish state)
(setf state-tag :START))
((or (char= c #\Space) (char= c #\Tab) (char= c #\Return))
(cell-input state c)
(incf (state-space-count state)))
(t
(cell-input state c)
(setf (state-space-count state) 0))))
(:QUOTE-ESCAPSE
(cond ((char= c quote)
(cell-input state c)
(setf state-tag :QUOTE)))))
:CONTINUE))
state))
(defun finish (state)
(when (> (state-offset state) 0)
(row-finish state))
(state-result state))
(defun parse-stream (stream &key (separator #\,) (quote #\") (buffer-size 8192) (field-size 1024))
(declare (optimize (speed 3) (space 0) (debug 0) (safety 0)))
(declare (type character separator quote))
(declare (type fixnum buffer-size field-size))
(let ((state (make-state buffer-size field-size)))
;; (declare (inline extend-field cell-input cell-finish row-finish parse finish))
(loop for size = (read-sequence (state-buffer state) stream)
until (= size 0)
do (parse size state separator quote)
finally (return (finish state)))))
;; (defparameter kaggle-mnist1-csv
;; "label \"pixel0\" pixel1
;; 1 0 0"
;; )
;; (with-input-from-string (s kaggle-mnist1-csv)
;; (parse-stream s :separator #\ ))
(defun parse-file (file &key (external-format #+clisp system::*default-file-encoding*
#+allegro excl:*default-external-format*
#+sbcl :utf-8)
(separator #\,) (quote #\")
(buffer-size 8192) (field-size 1024))
(if (null (probe-file file))
(error "can't open file")
(with-open-file (stream file :direction :input :external-format external-format
#+clisp :buffered #+clisp t
#+allegro :mapped #+allegro t)
(parse-stream stream :separator separator :quote quote
:buffer-size buffer-size :field-size field-size))))
;; https://github.com/sbussmann/kaggle-mnist
;; (time (defparameter kaggle-mnist1 (parse-file "/home/wiz/datasets/kaggle-mnist/Data/train.csv")))
;; Evaluation took:
;; 1.776 seconds of real time
;; 1.776000 seconds of total run time (1.668000 user, 0.108000 system)
;; [ Run times consist of 0.824 seconds GC time, and 0.952 seconds non-GC time. ]
;; 100.00% CPU
;; 6,023,211,716 processor cycles
;; 1,583,386,544 bytes consed
;; (time (defparameter kaggle-mnist2 (fare-csv:read-csv-file "/home/wiz/datasets/kaggle-mnist/Data/test.csv")))
;; Evaluation took:
;; 8.921 seconds of real time
;; 8.892000 seconds of total run time (8.728000 user, 0.164000 system)
;; [ Run times consist of 1.400 seconds GC time, and 7.492 seconds non-GC time. ]
;; 99.67% CPU
;; 30,261,429,557 processor cycles
;; 2,945,108,736 bytes consed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment