Skip to content

Instantly share code, notes, and snippets.

@awwsmm
Created November 10, 2018 12:00
Show Gist options
  • Select an option

  • Save awwsmm/3527b995f25f74268b02e330fd4311ba to your computer and use it in GitHub Desktop.

Select an option

Save awwsmm/3527b995f25f74268b02e330fd4311ba to your computer and use it in GitHub Desktop.
Splits a line of sentinel-separated values into tokens
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Splits a line of sentinel-separated values (CSV, TSV, dotted addresses, ...)
 * into its tokens.
 *
 * <p>Tokens follow the RFC 4180 style: a token is either a double-quoted string
 * (in which a literal quote is written as two quotes, {@code ""}) or a run of
 * characters containing no quote, no sentinel and no newline. Quoted tokens are
 * returned verbatim, i.e. still wrapped in their quotes and with {@code ""}
 * left un-collapsed.
 */
public class SplitXSV {

  /** Heavy rule printed at the top of every demo case. */
  public static final String doubleLine = "=========================";

  /** Light rule printed around the input string and around the result. */
  public static final String singleLine = "----------";

  /**
   * Runs the splitter over a handful of sample lines and prints the results.
   *
   * @param args ignored
   */
  public static void main (String[] args) {

    // split lines of (RFC 4180) standard CSV
    splitAndPrint("a,b, c ,d,e, f", ','); // basic CSV
    splitAndPrint("a,\" b \",\" \"\"c\"\" \",", ',');

    // split IP addresses, phone numbers, sentences, etc.
    splitAndPrint("144.123.222.0", '.'); // XSV with sentinel '.'
    splitAndPrint("1-800-944-2345", '-'); // XSV with sentinel '-'
    splitAndPrint("hi how are you", ' '); // XSV with sentinel ' '
    splitAndPrint("\nline2\nline3", '\n'); // XSV with sentinel '\n'
    splitAndPrint("col1\tcol2", '\t'); // XSV with sentinel '\t'
  }

  /**
   * Splits {@code line} on {@code sentinel} and prints the sentinel, the input
   * and every resulting token (each wrapped in braces) to standard output.
   *
   * @param line     text to split
   * @param sentinel separator character
   */
  public static void splitAndPrint(String line, char sentinel) {

    // one "{token}" per output line
    String tokens = splitXSV(line, sentinel).stream()
        .map(e -> String.format("{%s}", e))
        .collect(Collectors.joining("\n"));

    System.out.printf("%2$s%n%ngiven sentinel: '%3$s'%n%ngiven string:%n%1$s" +
        "%n{%4$s}%n%1$s%n%nresult:%n%1$s%n%5$s%n%1$s%n%n",
        singleLine, doubleLine, sentinel, line, tokens);
  }

  /**
   * Splits a line of sentinel-separated values into tokens.
   *
   * <p>Both the sentinel and a newline act as delimiters. A delimiter at the
   * very start or very end of the line produces an empty token on that side.
   * Text that fits neither token form (for example characters trailing a
   * closing quote, or a stray quote inside an unquoted token) is skipped
   * rather than reported.
   *
   * @param line     text to split; may be {@code null}
   * @param sentinel separator character
   * @return the tokens in order of appearance; an empty, mutable list when
   *         {@code line} is {@code null} or empty
   */
  public static List<String> splitXSV (String line, char sentinel) {

    // nothing to split
    if (line == null || line.isEmpty()) {
      return new ArrayList<>(0);
    }

    List<String> tokens = new ArrayList<>();

    // Every match below swallows the delimiter in front of its token, so a
    // delimiter sitting at index 0 is consumed by the first match and the empty
    // token preceding it would be lost. Both delimiters the pattern recognises
    // (sentinel and newline) need this, mirroring the empty token produced by a
    // trailing delimiter.
    char first = line.charAt(0);
    if (first == sentinel || first == '\n') {
      tokens.add("");
    }

    Matcher matcher = buildPattern(sentinel).matcher(line);
    while (matcher.find()) {
      tokens.add(matcher.group(1));
    }

    return tokens;

  } // end splitXSV()

  /** Compiles the "delimiter followed by one token" pattern for the given sentinel. */
  private static Pattern buildPattern(char sentinel) {

    // Letters and digits are used literally (a backslash in front of them would
    // form an escape sequence or a back-reference); every other character is
    // backslash-escaped, which also makes it safe inside a character class.
    String sep = Character.isLetterOrDigit(sentinel)
        ? String.valueOf(sentinel)
        : "\\" + sentinel;

    // token MUST begin with the sentinel OR a newline OR the start of the text
    String delimiter = "(?:" + sep + "|\\n|^)";

    // (1) quoted token: opening quote, then runs of non-quote characters
    //     separated by escaped quotes (""), then the closing quote. Written
    //     "unrolled" with possessive runs so that each character can be consumed
    //     in exactly one way -- an unterminated quote fails fast instead of
    //     backtracking through nested quantifiers.
    String quoted = "\"[^\"]*+(?:\"\"[^\"]*+)*\"";

    // (2) plain token: zero or more characters that are not a quote, the
    //     sentinel or a newline. This alternative can match the empty string,
    //     which is how empty tokens (e.g. after a trailing sentinel) are found.
    String plain = "[^\"" + sep + "\\n]*";

    // group 1 captures the token without its leading delimiter
    return Pattern.compile(delimiter + "(" + quoted + "|" + plain + ")");
  }

} // end SplitXSV class
/******************************* PROGRAM OUTPUT ********************************
=========================
given sentinel: ','
given string:
----------
{a,b, c ,d,e, f}
----------
result:
----------
{a}
{b}
{ c }
{d}
{e}
{ f}
----------
=========================
given sentinel: ','
given string:
----------
{a," b "," ""c"" ",}
----------
result:
----------
{a}
{" b "}
{" ""c"" "}
{}
----------
=========================
given sentinel: '.'
given string:
----------
{144.123.222.0}
----------
result:
----------
{144}
{123}
{222}
{0}
----------
=========================
given sentinel: '-'
given string:
----------
{1-800-944-2345}
----------
result:
----------
{1}
{800}
{944}
{2345}
----------
=========================
given sentinel: ' '
given string:
----------
{hi how are you}
----------
result:
----------
{hi}
{how}
{are}
{you}
----------
=========================
given sentinel: '
'
given string:
----------
{
line2
line3}
----------
result:
----------
{}
{line2}
{line3}
----------
=========================
given sentinel: ' '
given string:
----------
{col1 col2}
----------
result:
----------
{col1}
{col2}
----------
*******************************************************************************/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment