Created
August 25, 2013 02:59
-
-
Save tmiller/6331719 to your computer and use it in GitHub Desktop.
Hand-written CSV tokenizer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Hand-written CSV tokenizer.
#
# Wraps an IO-like object (anything that responds to +getc+) and hands out
# one token per call to #next_token. A token is a two-element array:
#
# * <tt>[:VALUE, text]</tt> for a field. Quoted fields are returned still
#   wrapped in double quotes, with each doubled quote (<tt>""</tt>) inside
#   the field collapsed to a single one.
# * <tt>[c, c]</tt> for any other character read (",", "\r" or "\n"), and
#   for whatever the underlying +getc+ returns when no character is
#   available (+nil+ for IO and StringIO).
class Tokenizer
  # Character reader that adds look-ahead (#peek) and push-back (#ungetc)
  # on top of the wrapped object's +getc+.
  class Buffer # :nodoc:
    # io:: object responding to +getc+.
    def initialize(io)
      @io = io
      @buffer = [] # characters already read from @io but not yet consumed
    end

    # Returns the next +len+ characters joined into a String without
    # consuming them. A +nil+ read from the source joins as nothing, so the
    # result may be shorter than +len+ (or empty).
    def peek(len = 1)
      # A negative count makes +times+ a no-op, so an already-full buffer
      # triggers no extra reads.
      (len - @buffer.length).times { @buffer << @io.getc }
      @buffer.first(len).join
    end

    # Consumes and returns the next character, preferring buffered
    # look-ahead over a fresh read from the source.
    def getc
      @buffer.empty? ? @io.getc : @buffer.shift
    end

    # Consumes +len+ characters and returns them joined into a String.
    def read(len)
      len.times.map { getc }.join
    end

    # Pushes +c+ back so that it is the very next character returned by
    # #getc / #peek. It is inserted at the FRONT of the look-ahead buffer;
    # appending would reorder it behind characters already peeked.
    def ungetc(c)
      @buffer.unshift(c)
    end
  end

  # io:: object responding to +getc+ that supplies the CSV text.
  def initialize(io)
    @buf = Buffer.new(io)
  end

  # Reads and returns the next token (see the class comment for shapes).
  #
  # Raises RuntimeError if a quoted field is not closed before the input
  # runs out.
  def next_token
    c = @buf.getc
    case c
    when '"' then [:VALUE, scan_quoted_value]
    when /[^,\r\n]/
      # First character of an unquoted field: put it back so scan_value
      # sees the whole field.
      @buf.ungetc(c)
      [:VALUE, scan_value]
    else
      [c, c]
    end
  end

  # Consumes characters up to (not including) the next ",", "\r", "\n" or
  # the point where peek yields nothing, and returns them as a String.
  def scan_value
    val = ''.dup # mutable even when string literals are frozen
    val << @buf.getc while @buf.peek =~ /[^,\n\r]/
    val
  end

  # Consumes the remainder of a quoted field; the opening quote must
  # already have been read. Returns the field re-wrapped in double quotes,
  # with each doubled quote collapsed to one.
  #
  # Raises RuntimeError when +getc+ returns +nil+ before the closing quote.
  def scan_quoted_value
    val = ''.dup # mutable even when string literals are frozen
    loop do
      c = @buf.getc
      case c
      when nil
        raise 'unterminated quoted value: input ended before closing quote'
      when '"'
        # A quote followed by another quote is an escaped literal quote;
        # a lone quote closes the field.
        break unless @buf.peek == '"'

        val << @buf.getc
      else
        val << c
      end
    end
    "\"#{val}\""
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment