Created
November 10, 2018 12:00
-
-
Save awwsmm/3527b995f25f74268b02e330fd4311ba to your computer and use it in GitHub Desktop.
Splits a line of sentinel-separated values into tokens
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import java.util.ArrayList; | |
| import java.util.List; | |
| import java.util.regex.Matcher; | |
| import java.util.regex.Pattern; | |
| import java.util.stream.Collectors; | |
/**
 * Splits a single line of "X-separated values" (CSV, TSV, dotted IP addresses,
 * dashed phone numbers, ...) into its tokens, where X is an arbitrary
 * one-character sentinel.
 *
 * <p>Quoted tokens follow the RFC 4180 convention: a token wrapped in double
 * quotes may contain the sentinel, and a literal quote inside it is written as
 * two consecutive quotes. Quoted tokens are returned verbatim, i.e. including
 * their surrounding quotes and any doubled quotes.
 */
public class SplitXSV {

  /** Runs {@link #splitAndPrint(String, char)} on a handful of sample lines. */
  public static void main(String[] args) {

    // split lines of (RFC 4180) standard CSV
    splitAndPrint("a,b, c ,d,e, f", ','); // basic CSV
    splitAndPrint("a,\" b \",\" \"\"c\"\" \",", ',');

    // split IP addresses, phone numbers, sentences, etc.
    splitAndPrint("144.123.222.0", '.'); // XSV with sentinel '.'
    splitAndPrint("1-800-944-2345", '-'); // XSV with sentinel '-'
    splitAndPrint("hi how are you", ' '); // XSV with sentinel ' '
    splitAndPrint("\nline2\nline3", '\n'); // XSV with sentinel '\n'
    splitAndPrint("col1\tcol2", '\t'); // XSV with sentinel '\t'
  }

  /** Heavy rule printed once at the top of every report. */
  public static String doubleLine = "=========================";

  /** Light rule printed above and below the input line and the result list. */
  public static String singleLine = "----------";

  /**
   * Splits {@code line} on {@code sentinel} and prints a small report to
   * standard output: the sentinel, the input wrapped in braces, and every
   * resulting token wrapped in braces on its own line.
   *
   * @param line the text to split; passed straight to {@link #splitXSV}
   * @param sentinel the separator character
   */
  public static void splitAndPrint(String line, char sentinel) {
    // One "{token}" per line, so leading/trailing whitespace stays visible.
    String tokens = splitXSV(line, sentinel).stream()
        .map(e -> String.format("{%s}", e))
        .collect(Collectors.joining("\n"));

    System.out.printf("%2$s%n%ngiven sentinel: '%3$s'%n%ngiven string:%n%1$s"
        + "%n{%4$s}%n%1$s%n%nresult:%n%1$s%n%5$s%n%1$s%n%n",
        singleLine, doubleLine, sentinel, line, tokens);
  }

  /**
   * Splits one line of sentinel-separated values into tokens.
   *
   * <p>A token is either a double-quoted string (returned with its quotes and
   * doubled inner quotes intact) or a run of characters containing no quote,
   * no sentinel and no newline. A newline is always treated as a separator in
   * addition to {@code sentinel}. Empty tokens (leading, trailing, or between
   * two adjacent separators) are returned as empty strings.
   *
   * @param line the text to split; may be {@code null}
   * @param sentinel the separator character
   * @return a new, mutable list of tokens in order of appearance; empty when
   *     {@code line} is {@code null} or empty
   */
  public static List<String> splitXSV(String line, char sentinel) {

    // nothing to split: return an empty list
    if (line == null || line.isEmpty()) {
      return new ArrayList<>();
    }

    // Letters and digits are used as-is ("\" + letter could form a regex
    // escape such as \d); everything else is backslash-escaped, which the
    // regex engine accepts for any non-alphabetic character. This is what
    // lets raw tab and newline characters work as sentinels.
    String sanitized = Character.isLetterOrDigit(sentinel)
        ? String.valueOf(sentinel)
        : "\\" + sentinel;

    // (?:X|\n|^)       -- token MUST begin with sentinel OR newline OR start
    // (                -- capture the token itself
    //   "(?:[^"]|"")*" -- (1) a quoted string: any mix of non-quote
    //                         characters and escaped quotes ("")
    //   |
    //   [^"X\n]*       -- (2) a plain token: zero or more characters that
    //                         are none of quote, sentinel, newline
    // )
    //
    // The quoted branch is deliberately flat: each step consumes exactly one
    // non-quote or one "" pair, so a missing closing quote fails after linear
    // backtracking. A nested form such as (?:(?:"")*[^"]*)* accepts the same
    // strings but can backtrack exponentially on engines that do not memoize
    // loop failures. No separate "empty token at end of line" branch is
    // needed because branch (2) already matches the empty string.
    String regex = String.format(
        "(?:%1$s|\\n|^)(\"(?:[^\"]|\"\")*\"|[^\"%1$s\\n]*)", sanitized);

    Matcher matcher = Pattern.compile(regex).matcher(line);
    List<String> tokens = new ArrayList<>();

    // When the line starts with the sentinel, the first regex match consumes
    // that sentinel as the *prefix* of the second token, so the empty first
    // token is never reported. Seed the list with it up front.
    if (line.charAt(0) == sentinel) {
      tokens.add("");
    }

    while (matcher.find()) {
      tokens.add(matcher.group(1));
    }

    return tokens;
  } // end splitXSV()

} // end SplitXSV class
| /******************************* PROGRAM OUTPUT ******************************** | |
| ========================= | |
| given sentinel: ',' | |
| given string: | |
| ---------- | |
| {a,b, c ,d,e, f} | |
| ---------- | |
| result: | |
| ---------- | |
| {a} | |
| {b} | |
| { c } | |
| {d} | |
| {e} | |
| { f} | |
| ---------- | |
| ========================= | |
| given sentinel: ',' | |
| given string: | |
| ---------- | |
| {a," b "," ""c"" ",} | |
| ---------- | |
| result: | |
| ---------- | |
| {a} | |
| {" b "} | |
| {" ""c"" "} | |
| {} | |
| ---------- | |
| ========================= | |
| given sentinel: '.' | |
| given string: | |
| ---------- | |
| {144.123.222.0} | |
| ---------- | |
| result: | |
| ---------- | |
| {144} | |
| {123} | |
| {222} | |
| {0} | |
| ---------- | |
| ========================= | |
| given sentinel: '-' | |
| given string: | |
| ---------- | |
| {1-800-944-2345} | |
| ---------- | |
| result: | |
| ---------- | |
| {1} | |
| {800} | |
| {944} | |
| {2345} | |
| ---------- | |
| ========================= | |
| given sentinel: ' ' | |
| given string: | |
| ---------- | |
| {hi how are you} | |
| ---------- | |
| result: | |
| ---------- | |
| {hi} | |
| {how} | |
| {are} | |
| {you} | |
| ---------- | |
| ========================= | |
| given sentinel: ' | |
| ' | |
| given string: | |
| ---------- | |
| { | |
| line2 | |
| line3} | |
| ---------- | |
| result: | |
| ---------- | |
| {} | |
| {line2} | |
| {line3} | |
| ---------- | |
| ========================= | |
| given sentinel: ' ' | |
| given string: | |
| ---------- | |
| {col1 col2} | |
| ---------- | |
| result: | |
| ---------- | |
| {col1} | |
| {col2} | |
| ---------- | |
| *******************************************************************************/ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment