Last active
July 27, 2019 08:36
-
-
Save basinilya/bd9ccddad879066155a1e61ea18dbd05 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.foo.csvtokenizer; | |
import java.io.Reader; | |
import java.io.StreamTokenizer; | |
import java.util.ArrayList; | |
import java.util.List; | |
public abstract class AbstractPushingCsvParser implements PushingCsvParser { | |
@Override | |
public abstract void onRecord(List<String> values, long lineno) throws Exception; | |
@Override | |
public void parse(final Reader reader) throws Exception { | |
try (Reader sysResource = reader) { | |
tokenizer = new CsvTokenizer(reader, remainderField, quotedFields); | |
for (;;) { | |
switch (tokenizer.nextToken()) { | |
case StreamTokenizer.TT_WORD: | |
mVals.add(tokenizer.sval); | |
break; | |
case StreamTokenizer.TT_EOF: | |
return; | |
case StreamTokenizer.TT_EOL: | |
onRecord(new ArrayList<>(mVals), tokenizer.lineno()); | |
mVals.clear(); | |
break; | |
default: | |
throw new RuntimeException("can't happen"); | |
} | |
} | |
} | |
} | |
private final List<String> mVals = new ArrayList<>(); | |
private CsvTokenizer tokenizer; | |
private int remainderField = Integer.MAX_VALUE; | |
@Override | |
public int getRemainderField() { | |
return remainderField; | |
} | |
@Override | |
public void setRemainderField(final int remainderField) { | |
this.remainderField = remainderField; | |
} | |
private boolean quotedFields = true; | |
@Override | |
public boolean isQuotedFields() { | |
return quotedFields; | |
} | |
@Override | |
public void setQuotedFields(final boolean quotedFields) { | |
this.quotedFields = quotedFields; | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.foo.csvtokenizer; | |
import java.io.IOException; | |
import java.io.Reader; | |
import java.io.StreamTokenizer; | |
import java.util.ArrayDeque; | |
import java.util.Deque; | |
public class CsvTokenizer extends StreamTokenizer { | |
public CsvTokenizer(final Reader r, final int remainderField, final boolean quotedFields) { | |
super(r); | |
this.quotedFields = quotedFields; | |
this.remainderField = remainderField; | |
resetSyntax(); | |
wordChars(0, 255); | |
whitespaceChars('\n', '\n'); | |
whitespaceChars('\r', '\r'); | |
if (remainderField > 0) { | |
enableFS(true); | |
} | |
eolIsSignificant(true); | |
} | |
@Override | |
public int nextToken() throws IOException { | |
for (;;) { | |
if (!savedTtypes.isEmpty()) { | |
restore(); | |
} else { | |
super.nextToken(); | |
} | |
switch (ttype) { | |
case TT_WORD: | |
fieldStarted = true; | |
startNextField(sval); | |
break; | |
case TT_EOF: | |
handleEof(); | |
return ttype; | |
case TT_EOL: | |
handleEol(); | |
return ttype; | |
default: | |
handleFs(); | |
if (ttype == TT_WORD) { | |
return ttype; | |
} | |
} | |
} | |
} | |
protected String getFs() { | |
return Character.toString(PushingCsvParser.DEFAULT_FS); | |
} | |
private final int remainderField; | |
private final boolean quotedFields; | |
private boolean fieldStarted; | |
private final Deque<Integer> savedTtypes = new ArrayDeque<>(2); | |
private String savedSval; | |
private int nFields; | |
private int restore() { | |
ttype = savedTtypes.pop(); | |
return ttype; | |
} | |
private void enableFS(final boolean enable) { | |
final String fs = getFs(); | |
for (int i = 0, len = fs.length(); i < len; i++) { | |
final char ch = fs.charAt(i); | |
if (enable) { | |
ordinaryChar(ch); | |
} else { | |
wordChars(ch, ch); | |
} | |
} | |
if (enable && quotedFields) { | |
ordinaryChar('"'); | |
} else { | |
wordChars('"', '"'); | |
} | |
} | |
private void startNextField(final String s) { | |
savedSval = s; | |
nFields++; | |
} | |
private void handleEof() throws IOException { | |
if (nFields > 0) { | |
savedTtypes.push(TT_EOF); | |
handleEol(); | |
} | |
} | |
private void handleEol() throws IOException { | |
if (!fieldStarted) { | |
startNextField(""); | |
fieldStarted = true; | |
} | |
if (nFields > 0) { | |
savedTtypes.push(TT_EOL); | |
ttype = TT_WORD; | |
sval = savedSval; | |
resetLine(); | |
} else { | |
fieldStarted = false; | |
} | |
} | |
private void handleFs() throws IOException { | |
if (getFs().indexOf((char) ttype) == -1) { | |
if (ttype == '"') { | |
if (!fieldStarted) { | |
readQuotedPart(); | |
} | |
recoverUnquotedTail(); | |
fieldStarted = true; | |
} else { | |
throw new RuntimeException("unexpected ttype: " + ttype); | |
} | |
} else { | |
if (!fieldStarted) { | |
startNextField(""); | |
} | |
if (nFields == remainderField) { | |
enableFS(false); | |
} | |
fieldStarted = false; | |
ttype = TT_WORD; | |
sval = savedSval; | |
} | |
} | |
private void readQuotedPart() throws IOException { | |
startQuotedMode(); | |
final StringBuilder sb = new StringBuilder(); | |
fieldStarted = true; | |
for (;;) { | |
super.nextToken(); | |
if (ttype == '"') { | |
quoteChoice(sb); | |
} else if (ttype == TT_WORD && fieldStarted) { | |
sb.append(sval); | |
} else { | |
break; // eof, eol, fs, or non-adjacent quote | |
} | |
} | |
startNextField(sb.toString()); | |
endQuotedMode(); | |
} | |
private void quoteChoice(final StringBuilder sb) { | |
if (!fieldStarted) { | |
sb.append('"'); | |
fieldStarted = true; | |
startQuotedMode(); | |
} else { | |
fieldStarted = false; | |
endQuotedMode(); | |
} | |
} | |
private void startQuotedMode() { | |
enableFS(false); | |
ordinaryChar('"'); | |
wordChars('\n', '\n'); | |
wordChars('\r', '\r'); | |
} | |
private void endQuotedMode() { | |
enableFS(true); | |
whitespaceChars('\n', '\n'); | |
whitespaceChars('\r', '\r'); | |
} | |
private void recoverUnquotedTail() throws IOException { | |
for (;;) { | |
if (ttype == TT_WORD) { | |
savedSval = savedSval.concat(sval); | |
} else if (ttype == '"') { | |
savedSval = savedSval.concat("\""); | |
} else { | |
savedTtypes.push(ttype); | |
break; | |
} | |
super.nextToken(); | |
} | |
} | |
private void resetLine() { | |
if (nFields == 0) { | |
throw new RuntimeException("Can't happen"); | |
} | |
if (nFields >= remainderField && remainderField > 0) { | |
enableFS(true); | |
} | |
nFields = 0; | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.foo.csvtokenizer; | |
import java.io.Reader; | |
import java.util.List; | |
/**
 * A push-style parser for separator-delimited text: the parser reads the input and calls
 * {@link #onRecord(List, long)} for every record it finds.
 */
public interface PushingCsvParser {

    /** The field separator character used by default. */
    char DEFAULT_FS = '~';

    /**
     * Reads {@code reader} and reports its records through {@link #onRecord(List, long)}.
     *
     * @param reader the character source
     * @throws Exception if reading, parsing or a record callback fails
     */
    void parse(Reader reader) throws Exception;

    /**
     * Callback invoked for each parsed record.
     *
     * @param values the field values of the record
     * @param lineno a line number associated with the record
     * @throws Exception if the record cannot be processed
     */
    void onRecord(List<String> values, long lineno) throws Exception;

    /** @return the current remainder-field setting */
    int getRemainderField();

    /** @param remainderField the new remainder-field setting */
    void setRemainderField(int remainderField);

    /** @return whether quoted fields are enabled */
    boolean isQuotedFields();

    /** @param quotedFields whether quoted fields are enabled */
    void setQuotedFields(boolean quotedFields);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.foo.csvtokenizer; | |
import static java.util.Arrays.asList; | |
import static org.junit.Assert.assertEquals; | |
import java.io.StringReader; | |
import java.util.ArrayList; | |
import java.util.List; | |
import org.junit.Before; | |
import org.junit.Test; | |
public class TestCsvParser { | |
@Test | |
public void test0ZeroLengthFile() throws Exception { | |
// Since there should be a way to store zero rows in csv, we treat zero | |
// length files as having no rows | |
doIt(""); | |
} | |
@Test | |
public void test100() throws Exception { | |
assertEquals(Integer.MAX_VALUE, parser.getRemainderField()); | |
doIt(" ", asList(" ")); | |
} | |
@Test | |
public void test200() throws Exception { | |
doIt("\n", asList("")); | |
} | |
@Test | |
public void test300() throws Exception { | |
doIt("~", asList("", "")); | |
} | |
@Test | |
public void test400() throws Exception { | |
doIt("~~", asList("", "", "")); | |
} | |
@Test | |
public void test500() throws Exception { | |
parser.setRemainderField(1); | |
doIt("~~\n~~", asList("", "~"), asList("", "~")); | |
} | |
@Test | |
public void test600() throws Exception { | |
parser.setRemainderField(0); | |
doIt("~~\n~~", asList("~~"), asList("~~")); | |
} | |
@Test | |
public void test700() throws Exception { | |
doIt("a~", asList("a", "")); | |
} | |
@Test | |
public void test800() throws Exception { | |
doIt("\n\na", asList(""), asList(""), asList("a")); | |
} | |
@Test | |
public void test900() throws Exception { | |
doIt( | |
"'one''one~one\none'~two\nthree~four".replace('\'', '"'), | |
asList("one\"one~one\none", "two"), | |
asList("three", "four")); | |
} | |
@Test | |
public void test950() throws Exception { | |
parser.setQuotedFields(false); | |
doIt( | |
"'one''one~one\none'~two\nthree~four".replace('\'', '"'), | |
asList("\"one\"\"one", "one"), | |
asList("one\"", "two"), | |
asList("three", "four")); | |
} | |
@Test | |
public void test1000() throws Exception { | |
doIt("'ab''cd'~x".replace('\'', '"'), asList("ab\"cd", "x")); | |
} | |
@Test | |
public void test1100() throws Exception { | |
doIt("'ab''cd~x'~x".replace('\'', '"'), asList("ab\"cd~x", "x")); | |
} | |
@Test | |
public void test1200() throws Exception { | |
doIt("'ab'cd'ef''g~x".replace('\'', '"'), asList("abcd\"ef\"\"g", "x")); | |
} | |
@Test | |
public void test1300() throws Exception { | |
doIt("'ab'cd~x".replace('\'', '"'), asList("abcd", "x")); | |
} | |
@Test | |
public void test1400() throws Exception { | |
doIt("ab'cd~x".replace('\'', '"'), asList("ab\"cd", "x")); | |
} | |
@Test | |
public void test1500() throws Exception { | |
doIt("ab''cd~x".replace('\'', '"'), asList("ab\"\"cd", "x")); | |
} | |
@SafeVarargs | |
private final void doIt(final String s, final List<String>... expectedRows) throws Exception { | |
parser.parse(new StringReader(s)); | |
assertEquals(asList(expectedRows), allRows); | |
} | |
private final List<List<String>> allRows = new ArrayList<>(); | |
@Before | |
public void setUp() { | |
allRows.clear(); | |
{ | |
parser = new AbstractPushingCsvParser() { | |
@Override | |
public void onRecord(final List<String> values, final long lineno) | |
throws Exception { | |
allRows.add(values); | |
} | |
}; | |
} | |
} | |
private PushingCsvParser parser; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment