-- Source: GitHub gist leeonix/3820ae3a43f74845e6f0, created June 12, 2014 08:49.
-- Description: parse a CSV file using a state machine.
-- Note: GitHub flagged this file as containing hidden or bidirectional Unicode
-- text; open it in an editor that reveals hidden Unicode characters to review.
-- vi: syntax=lua ts=4 sw=4 et:
--
-- FILE: csv.lua
-- AUTHOR: LeeoNix
-- DESCRIPTION: Parses CSV files using a state machine.
-- NOTES: ---
--
-- Character classes. Each class selects one column of the parser's
-- state-transition table.
local C_INV = 1   -- invalid characters
local C_COMMA = 2 -- ,
local C_DQUOT = 3 -- "
local C_LF = 4    -- \n
local C_CR = 5    -- \r
local C_ETC = 6   -- the rest
local C_MAX = 7   -- one past the last class

--
-- This array maps the first 64 byte values (codes 0..63) to character
-- classes. It is indexed with code + 1 because Lua arrays are 1-based.
-- Bytes with code 64 and above are not in the table; they should be mapped
-- to C_ETC by the code doing the lookup.
--
local ascii_class = {}
for code = 0, 63 do
    ascii_class[code + 1] = C_ETC
end
ascii_class[0 + 1] = C_INV    -- NUL
ascii_class[10 + 1] = C_LF    -- \n
ascii_class[13 + 1] = C_CR    -- \r
ascii_class[34 + 1] = C_DQUOT -- "
ascii_class[44 + 1] = C_COMMA -- ,
-- Parser states. Each state names what the previously consumed byte was.
local S_NLF = 1 -- a line feed (row ended)
local S_NCR = 2 -- a carriage return (row ended)
local S_NCL = 3 -- the line feed directly following a carriage return
local S_NST = 4 -- a comma (a new field starts)
local S_STR = 5 -- a byte of an unquoted field
local S_QST = 6 -- a byte of a quoted field
local S_QT1 = 7 -- the opening quote of a quoted field
local S_QT2 = 8 -- a quote inside a quoted field (closing, or first of a doubled pair)
local S_MAX = 9
local S_ERR = S_MAX -- error state; it has no row in state_table

-- state_table[state][character class] -> next state
local state_table = {
    --[[          inv    ,      "      \n     \r     etc  ]]
    --[[ S_NLF ]] { S_ERR, S_NST, S_QT1, S_NLF, S_NCR, S_STR },
    --[[ S_NCR ]] { S_ERR, S_NST, S_QT1, S_NCL, S_NCR, S_STR },
    --[[ S_NCL ]] { S_ERR, S_NST, S_QT1, S_NLF, S_NCR, S_STR },
    --[[ S_NST ]] { S_ERR, S_NST, S_QT1, S_NLF, S_NCR, S_STR },
    --[[ S_STR ]] { S_ERR, S_NST, S_ERR, S_NLF, S_NCR, S_STR },
    --[[ S_QST ]] { S_ERR, S_QST, S_QT2, S_QST, S_QST, S_QST },
    --[[ S_QT1 ]] { S_ERR, S_QST, S_QT2, S_QST, S_QST, S_QST },
    --[[ S_QT2 ]] { S_ERR, S_NST, S_QST, S_NLF, S_NCR, S_ERR },
}
-- The parser object. Its methods are called as csv:method(...), so every
-- call shares the fields below.
local csv = {
    state = S_NLF,   -- current state of the state machine
    cache = '',      -- text kept between two add() calls
    cache_row = nil, -- row kept between two add() calls
    result = {},     -- list of parsed rows; each row is a list of field strings
}
-- Reset the parser to its initial state and start a fresh, empty result list.
-- The previous result table is not modified; it is simply no longer referenced
-- by the parser.
function csv:clear()
    self.state = S_NLF
    self.cache = ''
    self.cache_row = nil
    self.result = {}
end
-- Feed one chunk of CSV text to the parser.
--
-- Completed rows are appended to self.result. A chunk may end anywhere, even
-- in the middle of a field or a row: the unfinished row and the text of its
-- unfinished field are kept in self.cache_row / self.cache and completed by
-- the next call.
--
-- Fields are returned without their surrounding quotes, and a doubled quote
-- inside a quoted field is returned as a single quote. An empty line produces
-- a row holding one empty field.
--
-- @param str  string, the next piece of input
-- @return true on success; false, "Parse Error" on malformed input or when
--         the parser has already been finished (call clear() to reuse it)
function csv:add(str)
    local state = self.state
    if state_table[state] == nil then
        -- finish() parks the parser in S_ERR, which has no transition row.
        return false, "Parse Error"
    end

    local row = self.cache_row
    if row ~= nil then
        -- Resume an unfinished row: self.cache holds the pending field from
        -- its very first byte (opening quote included), so scanning it again
        -- from the "new field" state rebuilds the exact parser position.
        str = self.cache .. str
        state = S_NST
    else
        row = {}
    end

    local result = self.result
    local start = 1          -- first byte of the current field, quote included
    local first, last = 1, 0 -- byte range of the field content; empty while last < first
    local escaped = false    -- the current field contains a doubled quote

    for i = 1, #str do
        local c = str:byte(i)
        local asc = c < 64 and ascii_class[c + 1] or C_ETC
        local prev = state
        state = state_table[state][asc]

        if state == S_STR then
            last = i
        elseif state == S_QST then
            if prev == S_QT2 then
                -- the second quote of a doubled pair
                escaped = true
            end
            last = i
        elseif state == S_NST or state == S_NLF or state == S_NCR then
            -- a delimiter closes the current field
            local field = str:sub(first, last)
            if escaped then
                field = field:gsub('""', '"')
                escaped = false
            end
            row[#row + 1] = field
            start = i + 1
            first = i + 1
            if state ~= S_NST then
                -- a line break also closes the row
                result[#result + 1] = row
                row = {}
            end
        elseif state == S_NCL then
            -- LF of a CR LF pair: the row was closed at the CR, so only skip
            -- the byte; otherwise it would leak into the next field
            start = i + 1
            first = i + 1
        elseif state == S_QT1 then
            -- content starts after the opening quote
            first = i + 1
        elseif state == S_ERR then
            return false, "Parse Error"
        end
        -- S_QT2 needs no action: the quote is not content unless another
        -- quote follows it
    end

    if state == S_NLF or state == S_NCR or state == S_NCL then
        -- the chunk ended on a row boundary; nothing is pending
        self.cache = ''
        self.cache_row = nil
    else
        -- the chunk ended inside a row; keep it for the next call
        self.cache = str:sub(start)
        self.cache_row = row
    end
    self.state = state
    return true
end

-- Finish parsing and return the rows collected so far.
--
-- If the input ended without a final line break, the end of input is treated
-- as the line terminator so the last row is not lost. After a successful
-- call the parser is parked in S_ERR; call clear() before feeding new input.
--
-- @return self.result on success; nil, "Finish Error" if the input ended
--         inside a quoted field or the parser is not in a finishable state
function csv:finish()
    if self.cache_row ~= nil then
        local ok = self:add('\n')
        -- Inside a quoted field the line break is absorbed as content, so a
        -- row that is still pending here means the quote was never closed.
        if not ok or self.cache_row ~= nil then
            return nil, "Finish Error"
        end
    end

    local state = self.state
    if state < S_NLF or state > S_NST then
        return nil, "Finish Error"
    end
    self.state = S_ERR
    return self.result
end
-- Parse a complete CSV document held in a string.
--
-- A line feed is appended when the text does not already end with a line
-- break, so the last row is terminated. Empty input is passed through
-- unchanged so that it produces no rows.
--
-- @param s  string, the whole CSV text
-- @return the list of rows on success; otherwise a false value and an
--         error message, as returned by add() or finish()
function csv:parse_string(s)
    local last = s:byte(-1) -- nil when s is empty
    if last ~= nil and last ~= 10 and last ~= 13 then
        s = s .. '\n'
    end
    local ok, err = self:add(s)
    if not ok then
        return ok, err
    end
    return self:finish()
end
-- Read a whole file and parse it as CSV.
--
-- Raises an error if the file cannot be opened, read or closed.
--
-- @param name  string, path of the file to read
-- @return whatever parse_string() returns for the file's content
function csv:parse_file(name)
    local f = assert(io.open(name, "r"))
    local s, read_err = f:read('*a')
    local closed, close_err = f:close()
    -- Reading can still fail after a successful open; report that error
    -- instead of handing nil to parse_string().
    assert(s, read_err)
    assert(closed, close_err)
    return self:parse_string(s)
end
-- Write rows as comma-separated lines.
--
-- Fields are joined with ',' exactly as stored; they are not quoted or
-- escaped.
--
-- @param t      list of rows to write (default: self.result)
-- @param write  function called with each piece of output (default: io.write)
-- @param nl     string written after every row (default: '\n')
function csv:dump(t, write, nl)
    t = t or self.result
    write = write or io.write
    nl = nl or '\n'
    for _, row in ipairs(t) do
        write(table.concat(row, ','))
        write(nl)
    end
end
-- Command-line use: parse the file named by the first argument and print
-- its rows. Skipped when no argument is available (for example when the
-- file is loaded without a global `arg` table), so loading the module
-- does not fail.
if arg and arg[1] then
    local rows, err = csv:parse_file(arg[1])
    if rows then
        csv:dump()
    else
        -- report the failure instead of silently printing a partial result
        io.stderr:write(tostring(err), '\n')
    end
end
return csv
-- (GitHub page footer, not part of csv.lua: "Sign up for free to join this
-- conversation on GitHub. Already have an account? Sign in to comment")