Created
July 9, 2013 05:13
-
-
Save hideaki-t/5954882 to your computer and use it in GitHub Desktop.
Regex-based CSV parser in Java (without newline support)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.ArrayList; | |
import java.util.Arrays; | |
import java.util.List; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
/**
 * Minimal regex-based parser for a single line of comma-separated values.
 *
 * <p>Fields may be wrapped in double quotes; inside a quoted field a comma is literal and a
 * doubled quote ({@code ""}) stands for one quote character. Quoted fields that span several
 * lines are not supported: the caller passes exactly one line at a time.
 */
public class CSVParser {
    /**
     * One lexical token: a separator comma, a run of characters containing neither a comma nor a
     * double quote, or a complete double-quoted section.
     */
    private static final Pattern TOKEN = Pattern.compile(",|[^,\"]+|\"(?:[^\"]|\"\")*\"");

    /**
     * Splits {@code line} into at most {@code max} parsed fields, followed by the unparsed rest
     * of the line when more input remains after the separator that closes the last parsed field.
     *
     * <p>Empty fields (two adjacent commas, a leading comma, a trailing comma) are reported as
     * empty strings. An empty line yields an empty list. Characters that cannot start a token,
     * such as the opening quote of an unterminated quoted field, are skipped.
     *
     * @param line the line to parse; expected to be non-null
     * @param max  maximum number of fields to parse; {@code 0} returns the whole line as the
     *             single element, a negative value means "no limit"
     * @return a new mutable list holding the parsed fields and, if the limit was reached before
     *         the end of the line, the raw (still quoted) remainder as the last element
     */
    public static List<String> parse(final String line, final int max) {
        if (max == 0) {
            final List<String> whole = new ArrayList<>(1);
            whole.add(line);
            return whole;
        }
        // Not presized with max: callers may pass a huge value to mean "everything".
        final List<String> fields = new ArrayList<>();
        final Matcher m = TOKEN.matcher(line);
        // True at the start of the line and right after a comma, i.e. whenever the next token
        // must supply a field and a comma (or the end of the line) therefore means "empty field".
        boolean fieldPending = true;
        boolean sawToken = false;
        while (m.find()) {
            sawToken = true;
            final String token = m.group();
            if (token.equals(",")) {
                if (fieldPending) {
                    fields.add("");
                }
                if (fields.size() == max) {
                    // This comma closed the last permitted field; the rest stays unparsed.
                    fields.add(line.substring(m.end()));
                    return fields;
                }
                fieldPending = true;
            } else {
                if (fields.size() == max) {
                    // A value directly follows the last permitted field without a separator
                    // (malformed input): keep it in the remainder instead of dropping it.
                    fields.add(line.substring(m.start()));
                    return fields;
                }
                fields.add(unquote(token));
                fieldPending = false;
            }
        }
        if (fieldPending && sawToken) {
            // The line ended right after a comma: trailing empty field.
            fields.add("");
        }
        return fields;
    }

    /** Strips the surrounding quotes of a quoted token and collapses {@code ""} into {@code "}. */
    private static String unquote(final String token) {
        if (token.startsWith("\"") && token.endsWith("\"")) {
            return token.substring(1, token.length() - 1).replace("\"\"", "\"");
        }
        return token;
    }

    /**
     * Self-test; the checks are plain {@code assert} statements, so run with {@code java -ea}.
     */
    public static void main(String[] args) {
        final String line = "\",\",1,2,3,\",,,\",\"hoge\"\"hoge\",abc,\"\"";
        final String[] v = {",", "1", "2", "3", ",,,", "hoge\"hoge", "abc", ""};
        // offset[i] is the index in line at which the i-th field starts.
        final int[] offset = {0, 4, 6, 8, 10, 16, 29, 33};
        for (int i = 0; i < 10; i++) {
            final List<String> r = CSVParser.parse(line, i);
            final int n = r.size() - 1;
            assert r.size() == (i < v.length ? i + 1 : v.length) : i;
            for (int j = 0; j < n; j++) {
                assert r.get(j).equals(v[j]);
            }
            assert i >= v.length || line.substring(offset[i]).equals(r.get(n));
        }
        // Empty unquoted fields must keep their position.
        assert parse("a,,b", 10).equals(Arrays.asList("a", "", "b"));
        assert parse(",a", 10).equals(Arrays.asList("", "a"));
        assert parse("a,", 10).equals(Arrays.asList("a", ""));
        assert parse("a,,b", 2).equals(Arrays.asList("a", "", "b"));
        assert parse("", 10).isEmpty();
        assert parse("a,b", Integer.MAX_VALUE).equals(Arrays.asList("a", "b"));
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment