Created
August 19, 2012 05:32
-
-
Save cky/3392271 to your computer and use it in GitHub Desktop.
Ruby string parser (minus interpolation) using Java enums
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package nz.kiwi.chris.j7gs.types; | |
import com.google.common.base.Preconditions; | |
/**
 * An immutable string value with value-based equality, hashing and ordering.
 *
 * <p>{@link #parseQuoted(String)} builds an instance from a single- or
 * double-quoted literal. Parsing is done by a small state machine
 * ({@code ParserState}) that is fed the input one {@code char} at a time and
 * accumulates its output in a {@code ParserData}.
 */
public class GSString implements Comparable<GSString> {
    private final String value;

    /**
     * Wraps the given string.
     *
     * @param value the string to wrap
     */
    public GSString(String value) {
        this.value = value;
    }

    @Override
    public int hashCode() {
        return value.hashCode();
    }

    @Override
    public boolean equals(Object obj) {
        return obj instanceof GSString && value.equals(((GSString) obj).value);
    }

    @Override
    public int compareTo(GSString rhs) {
        return value.compareTo(rhs.value);
    }

    @Override
    public String toString() {
        return value;
    }

    /**
     * Parses a single- or double-quoted string according to Ruby rules,
     * except for interpolation, which is not implemented.
     *
     * @param str the string to parse, including its surrounding quotes
     * @return the parsed string
     * @throws IllegalArgumentException if the string is {@code null} or is
     *         not a valid single- or double-quoted string
     */
    public static GSString parseQuoted(String str) {
        checkArgument(str != null, "String to parse must not be null");
        ParserData data = new ParserData();
        ParserState state = ParserState.START;
        for (int i = 0; i < str.length(); ++i) {
            state = state.handle(data, str.charAt(i));
            if (data.pushback) {
                // The state declined to consume this character; feed it
                // again to the state that was just returned.
                --i;
                data.pushback = false;
            }
        }
        if (state != ParserState.END) {
            throw new IllegalArgumentException(incompleteMessage(state));
        }
        return new GSString(data.toString());
    }

    /**
     * Describes why input that ended in the given non-final state is invalid.
     *
     * @param state the state the parser was in when the input ran out
     * @return a message suitable for an {@link IllegalArgumentException}
     */
    private static String incompleteMessage(ParserState state) {
        switch (state) {
            case START:
                // Nothing was consumed at all, i.e. the input was empty.
                return "String did not start with quote";
            case SINGLE:
            case DOUBLE:
            case HASH:
                return "String ran out before closing quote";
            default:
                return "String ran out inside escape";
        }
    }

    /**
     * Throws {@link IllegalArgumentException} with the given message unless
     * the condition holds.
     */
    private static void checkArgument(boolean condition, String message) {
        if (!condition) {
            throw new IllegalArgumentException(message);
        }
    }

    /**
     * Mutable scratch space shared by all parser states during one parse.
     */
    private static class ParserData {
        /** Output accumulated so far. */
        private final StringBuilder sb = new StringBuilder();
        /** Numeric value of the octal/hex/Unicode escape being read. */
        private int scratch;
        /** Number of digits consumed for the current numeric escape. */
        private int count;
        /** Set while a control escape is pending; the next append is masked. */
        private boolean ctrl;
        /** Set by a state to ask the driver loop to re-deliver the character. */
        private boolean pushback;

        /**
         * Appends a character, masking it with {@code 0x9f} if a control
         * escape is pending, and clears the pending control flag.
         */
        public ParserData append(char c) {
            sb.append(ctrl ? (char) (c & 0x9f) : c);
            ctrl = false;
            return this;
        }

        /**
         * Appends a code point, masking it with {@code 0x9f} if a control
         * escape is pending, and clears the pending control flag.
         *
         * @throws IllegalArgumentException if the code point is not a valid
         *         Unicode code point (raised by {@link StringBuilder})
         */
        public ParserData appendCodePoint(int codePoint) {
            sb.appendCodePoint(ctrl ? codePoint & 0x9f : codePoint);
            ctrl = false;
            return this;
        }

        @Override
        public String toString() {
            return sb.toString();
        }
    }

    /**
     * States of the quoted-string parser. Each state consumes one character
     * and returns the state that should see the next one.
     */
    private enum ParserState {
        /** Before the opening quote. */
        START {
            @Override
            ParserState handle(ParserData data, char c) {
                switch (c) {
                    case '\'':
                        return SINGLE;
                    case '"':
                        return DOUBLE;
                    default:
                        throw new IllegalArgumentException("String did not start with quote");
                }
            }
        },
        /** After the closing quote; any further character is an error. */
        END {
            @Override
            ParserState handle(ParserData data, char c) {
                throw new IllegalArgumentException("Unescaped quote character");
            }
        },

        /*
         * Single-quoted string states.
         */

        /** Inside a single-quoted string. */
        SINGLE {
            @Override
            ParserState handle(ParserData data, char c) {
                switch (c) {
                    case '\'':
                        return END;
                    case '\\':
                        return SINGLE_ESC;
                    default:
                        data.append(c);
                        return this;
                }
            }
        },
        /**
         * After a backslash in a single-quoted string. Only {@code \'} and
         * {@code \\} are escapes; any other backslash is kept literally.
         */
        SINGLE_ESC {
            @Override
            ParserState handle(ParserData data, char c) {
                switch (c) {
                    default:
                        data.append('\\');
                        // $FALL-THROUGH$
                    case '\'':
                    case '\\':
                        data.append(c);
                        return SINGLE;
                }
            }
        },

        /*
         * Double-quoted string states.
         */

        /** Inside a double-quoted string. */
        DOUBLE {
            @Override
            ParserState handle(ParserData data, char c) {
                switch (c) {
                    case '"':
                        return END;
                    case '\\':
                        return DOUBLE_ESC;
                    case '#':
                        return HASH;
                    default:
                        data.append(c);
                        return this;
                }
            }
        },
        /** After a backslash in a double-quoted string. */
        DOUBLE_ESC {
            @Override
            ParserState handle(ParserData data, char c) {
                if (simpleEscape(data, c)) {
                    return DOUBLE;
                }
                switch (c) {
                    case '0': case '1': case '2': case '3':
                    case '4': case '5': case '6': case '7':
                        // The digit itself is part of the number; re-deliver it.
                        data.pushback = true;
                        data.scratch = 0;
                        data.count = 0;
                        return OCTAL;
                    case 'x':
                        data.scratch = 0;
                        data.count = 0;
                        return HEX;
                    case 'u':
                        data.scratch = 0;
                        data.count = 0;
                        return UNICODE_INIT;
                    case '\r':
                        return CR;
                    case '\n':
                        if (data.ctrl) {
                            // A pending control escape takes the newline as
                            // its operand; appending it clears the flag so
                            // it cannot leak onto the next character.
                            data.append(c);
                        }
                        // Otherwise backslash-newline is a line continuation.
                        return DOUBLE;
                    case 'C':
                        return BIG_C;
                    case 'c':
                        return CONTROL;
                    default:
                        data.append(c);
                        return DOUBLE;
                }
            }
        },
        /** Reading up to three octal digits. */
        OCTAL {
            @Override
            ParserState handle(ParserData data, char c) {
                return parseNumber(data, c, 3, 1, 3, DOUBLE);
            }
        },
        /** Reading one or two hex digits after {@code \x}. */
        HEX {
            @Override
            ParserState handle(ParserData data, char c) {
                return parseNumber(data, c, 4, 1, 2, DOUBLE);
            }
        },
        /** Just after {@code \}{@code u}: decides between the braced and four-digit forms. */
        UNICODE_INIT {
            @Override
            ParserState handle(ParserData data, char c) {
                if (c == '{') {
                    return UNICODE_LCURLY;
                }
                data.pushback = true;
                return UNICODE4;
            }
        },
        /** Reading exactly four hex digits of an unbraced Unicode escape. */
        UNICODE4 {
            @Override
            ParserState handle(ParserData data, char c) {
                return parseNumber(data, c, 4, 4, 4, DOUBLE);
            }
        },
        /** Reading one to six hex digits of a braced Unicode escape. */
        UNICODE_LCURLY {
            @Override
            ParserState handle(ParserData data, char c) {
                return parseNumber(data, c, 4, 1, 6, UNICODE_RCURLY);
            }
        },
        /** Expecting the closing brace of a braced Unicode escape. */
        UNICODE_RCURLY {
            @Override
            ParserState handle(ParserData data, char c) {
                checkArgument(c == '}', "Invalid Unicode escape");
                return DOUBLE;
            }
        },
        /** After backslash-CR: a following LF completes a CRLF line continuation. */
        CR {
            @Override
            ParserState handle(ParserData data, char c) {
                if (c != '\n') {
                    // Lone CR: keep it and re-deliver the current character.
                    data.pushback = true;
                    data.append('\r');
                } else if (data.ctrl) {
                    // CRLF under a pending control escape: the newline is the
                    // operand, so append it to clear the control flag.
                    data.append('\n');
                }
                return DOUBLE;
            }
        },
        /** After {@code \C}: a hyphen must follow. */
        BIG_C {
            @Override
            ParserState handle(ParserData data, char c) {
                checkArgument(c == '-', "Invalid control escape");
                return CONTROL;
            }
        },
        /** Reading the operand of a control escape. */
        CONTROL {
            @Override
            ParserState handle(ParserData data, char c) {
                // A control escape nested inside another one is rejected.
                checkArgument(!data.ctrl, "Invalid control escape");
                switch (c) {
                    case '\\':
                        // Operand is itself an escape; mask whatever it yields.
                        data.ctrl = true;
                        return DOUBLE_ESC;
                    case '?':
                        data.append('\177');
                        return DOUBLE;
                    default:
                        data.ctrl = true;
                        data.append(c);
                        return DOUBLE;
                }
            }
        },
        /** After {@code #} in a double-quoted string. */
        HASH {
            @Override
            ParserState handle(ParserData data, char c) {
                switch (c) {
                    case '$': case '@': case '{':
                        throw new IllegalArgumentException("String interpolation is not implemented");
                    default:
                        // Plain '#': emit it and re-deliver the current character.
                        data.append('#');
                        data.pushback = true;
                        return DOUBLE;
                }
            }
        };

        /**
         * Consumes one input character.
         *
         * @param data shared parser scratch space and output buffer
         * @param c the character to consume
         * @return the state that should handle the next character
         * @throws IllegalArgumentException if the character is not valid here
         */
        abstract ParserState handle(ParserData data, char c);

        /**
         * Handles the single-letter escapes (and an escaped backslash).
         *
         * @return {@code true} if {@code c} was such an escape and its
         *         replacement was appended, {@code false} if nothing was done
         */
        static boolean simpleEscape(ParserData data, char c) {
            switch (c) {
                case '\\': break;
                case 'n': c = '\n'; break;
                case 't': c = '\t'; break;
                case 'r': c = '\r'; break;
                case 'f': c = '\f'; break;
                case 'v': c = '\13'; break;
                case 'a': c = '\7'; break;
                case 'e': c = '\33'; break;
                case 'b': c = '\b'; break;
                case 's': c = '\40'; break;
                default: return false;
            }
            data.append(c);
            return true;
        }

        /**
         * Returns the value of a hexadecimal digit, or -1 if {@code c} is
         * not one.
         */
        private static int hexValue(char c) {
            switch (c) {
                case '0': case '1': case '2': case '3': case '4':
                case '5': case '6': case '7': case '8': case '9':
                    return c - '0';
                case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
                    return c - ('A' - 10);
                case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
                    return c - ('a' - 10);
                default:
                    return -1;
            }
        }

        /**
         * Accumulates one digit of a numeric escape into {@code data.scratch}.
         *
         * @param data shared parser scratch space and output buffer
         * @param c the character to consume
         * @param shift bits per digit (3 for octal, 4 for hexadecimal)
         * @param minCount minimum number of digits required
         * @param maxCount maximum number of digits accepted
         * @param nextState state to enter once the number is complete
         * @return this state while more digits may follow, otherwise
         *         {@code nextState}
         * @throws IllegalArgumentException if a non-digit arrives before
         *         {@code minCount} digits were read, or the resulting code
         *         point is invalid
         */
        ParserState parseNumber(ParserData data, char c, int shift, int minCount,
                int maxCount, ParserState nextState) {
            int value = hexValue(c);
            if (value >= 0 && value < (1 << shift)) {
                data.scratch = (data.scratch << shift) + value;
                if (++data.count < maxCount) {
                    return this;
                }
            } else {
                checkArgument(data.count >= minCount, "Invalid hex/Unicode escape");
                // Not a digit: the number is complete; re-deliver this character.
                data.pushback = true;
            }
            data.appendCodePoint(data.scratch);
            return nextState;
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package nz.kiwi.chris.j7gs.types; | |
import org.testng.annotations.Test; | |
import static org.testng.Assert.assertEquals; | |
import static org.testng.Assert.fail; | |
public class GSStringTest { | |
private static void checkParseSingle(String expr, String expected) { | |
assertEquals(GSString.parseQuoted('\'' + expr + '\''), new GSString(expected)); | |
} | |
private static void checkParseDouble(String expr, String expected) { | |
assertEquals(GSString.parseQuoted('"' + expr + '"'), new GSString(expected)); | |
} | |
private static void checkParseDoubleForException(String expr) { | |
try { | |
GSString.parseQuoted('"' + expr + '"'); | |
fail(); | |
} catch (IllegalArgumentException expected) {} | |
} | |
@Test | |
public static void testParseSingle() { | |
checkParseSingle("foo bar", "foo bar"); | |
checkParseSingle("foo\\bar", "foo\\bar"); | |
checkParseSingle("foo\\\\bar", "foo\\bar"); | |
checkParseSingle("foo\\'bar", "foo'bar"); | |
checkParseSingle("foo\\\"bar", "foo\\\"bar"); | |
checkParseSingle("foo\"bar", "foo\"bar"); | |
} | |
@Test(expectedExceptions = IllegalArgumentException.class) | |
public static void testParseSingleWithoutLeadingQuote() { | |
GSString.parseQuoted("foo bar'"); | |
} | |
@Test(expectedExceptions = IllegalArgumentException.class) | |
public static void testParseSingleWithoutTrailingQuote() { | |
GSString.parseQuoted("'foo bar"); | |
} | |
@Test(expectedExceptions = IllegalArgumentException.class) | |
public static void testParseSingleWithRunawayEscape() { | |
GSString.parseQuoted("'foo bar\\"); | |
} | |
@Test(expectedExceptions = IllegalArgumentException.class) | |
public static void testParseSingleWithStrayQuote() { | |
GSString.parseQuoted("'foo'bar'"); | |
} | |
@Test | |
public static void testParseDouble() { | |
// Basic stuff | |
checkParseDouble("foo bar", "foo bar"); | |
checkParseDouble("foo\\bar", "foo\bar"); | |
checkParseDouble("foo\\\\bar", "foo\\bar"); | |
checkParseDouble("foo\\\"bar", "foo\"bar"); | |
checkParseDouble("foo\\'bar", "foo'bar"); | |
checkParseDouble("foo'bar", "foo'bar"); | |
// Escaped interpolation sequences | |
checkParseDouble("foo#bar", "foo#bar"); | |
checkParseDouble("foo#\\$bar", "foo#$bar"); | |
checkParseDouble("foo\\#$bar", "foo#$bar"); | |
checkParseDouble("foo#\\@bar", "foo#@bar"); | |
checkParseDouble("foo\\#@bar", "foo#@bar"); | |
checkParseDouble("foo#\\{bar}", "foo#{bar}"); | |
checkParseDouble("foo\\#{bar}", "foo#{bar}"); | |
// Simple escapes | |
checkParseDouble("\\a\\\\a", "\7\\a"); | |
checkParseDouble("\\b\\\\b", "\10\\b"); | |
checkParseDouble("\\t\\\\t", "\11\\t"); | |
checkParseDouble("\\n\\\\n", "\12\\n"); | |
checkParseDouble("\\v\\\\v", "\13\\v"); | |
checkParseDouble("\\f\\\\f", "\14\\f"); | |
checkParseDouble("\\r\\\\r", "\15\\r"); | |
checkParseDouble("\\e\\\\e", "\33\\e"); | |
checkParseDouble("\\s\\\\s", "\40\\s"); | |
// Octal escapes | |
checkParseDouble("\\0", "\0"); | |
checkParseDouble("\\000", "\0"); | |
checkParseDouble("\\0000", "\0" + "0"); | |
checkParseDouble("\\377", "\377"); | |
checkParseDouble("\\378", "\37" + "8"); | |
checkParseDouble("\\387", "\3" + "87"); | |
checkParseDouble("\\777", "\u01ff"); | |
// Hexadecimal escapes | |
checkParseDouble("\\x0", "\0"); | |
checkParseDouble("\\x00", "\0"); | |
checkParseDouble("\\x000", "\0" + "0"); | |
checkParseDouble("\\xff", "\377"); | |
checkParseDouble("\\xfg", "\17g"); | |
// Unicode escapes | |
checkParseDouble("\\u0000", "\0"); | |
checkParseDouble("\\u00000", "\0" + "0"); | |
checkParseDouble("\\u{0}", "\0"); | |
checkParseDouble("\\u{000}0", "\0" + "0"); | |
checkParseDouble("\\u{0000}", "\0"); | |
checkParseDouble("\\u{000000}", "\0"); | |
checkParseDouble("\\u28cd2", "\u28cd" + "2"); | |
checkParseDouble("\\u{28cd2}", String.format("%c", 0x28cd2)); | |
checkParseDouble("\\u10ffff", "\u10ff" + "ff"); | |
checkParseDouble("\\u{10ff}ff", "\u10ff" + "ff"); | |
checkParseDouble("\\u{10fff}f", String.format("%c%c", 0x10fff, 'f')); | |
checkParseDouble("\\u{10ffff}", String.format("%c", 0x10ffff)); | |
// Newline escapes | |
checkParseDouble("\\\n", ""); | |
checkParseDouble("\\\r\n", ""); | |
checkParseDouble("\\\r", "\r"); | |
// Control escapes (straightforward cases) | |
checkParseDouble("\\ca", "\1"); | |
checkParseDouble("\\cA", "\1"); | |
checkParseDouble("\\C-a", "\1"); | |
checkParseDouble("\\C-A", "\1"); | |
checkParseDouble("\\c?", "\177"); | |
checkParseDouble("\\C-?", "\177"); | |
// Control escapes (with further escapes) | |
checkParseDouble("\\c\\0", "\0"); | |
checkParseDouble("\\c\\n", "\n"); | |
checkParseDouble("\\c\\s", "\0"); | |
checkParseDouble("\\c\\x3f", "\37"); // not \177 | |
checkParseDouble("\\c\\77", "\37"); // not \177 | |
checkParseDouble("\\c\\xa0", "\200"); | |
// Control escapes (oddball cases) | |
checkParseDouble("\\c-a", "\15a"); | |
checkParseDouble("\\c\"", "\2"); | |
checkParseDouble("\\c\\\\", "\34"); | |
checkParseDouble("\\c#$foo", "\3$foo"); | |
checkParseDouble("\\c#@foo", "\3@foo"); | |
checkParseDouble("\\c#{foo}", "\3{foo}"); | |
} | |
@Test(expectedExceptions = IllegalArgumentException.class) | |
public static void testParseDoubleWithoutLeadingQuote() { | |
GSString.parseQuoted("foo bar\""); | |
} | |
@Test(expectedExceptions = IllegalArgumentException.class) | |
public static void testParseDoubleWithoutTrailingQuote() { | |
GSString.parseQuoted("\"foo bar"); | |
} | |
@Test(expectedExceptions = IllegalArgumentException.class) | |
public static void testParseDoubleWithRunawayEscape() { | |
GSString.parseQuoted("\"foo bar\\"); | |
} | |
@Test(expectedExceptions = IllegalArgumentException.class) | |
public static void testParseDoubleWithRunawayControl() { | |
GSString.parseQuoted("\"foo bar\\c"); | |
} | |
@Test | |
public static void testParseDoubleWithStrayQuote() { | |
checkParseDoubleForException("foo\"bar"); | |
} | |
@Test | |
public static void testParseDoubleInvalidHex() { | |
checkParseDoubleForException("\\x"); | |
checkParseDoubleForException("\\xg"); | |
} | |
@Test | |
public static void testParseDoubleInvalidUnicode() { | |
checkParseDoubleForException("\\u"); | |
checkParseDoubleForException("\\u123"); | |
checkParseDoubleForException("\\u123g"); | |
checkParseDoubleForException("\\u{"); | |
checkParseDoubleForException("\\u{}"); | |
checkParseDoubleForException("\\u{g}"); | |
checkParseDoubleForException("\\u{abcdg}"); | |
checkParseDoubleForException("\\u{abcdef}"); | |
checkParseDoubleForException("\\u{110000}"); | |
checkParseDoubleForException("\\u{0000000}"); | |
} | |
@Test | |
public static void testParseDoubleInvalidControl() { | |
checkParseDoubleForException("\\c"); | |
checkParseDoubleForException("\\C"); | |
checkParseDoubleForException("\\C?"); | |
checkParseDoubleForException("\\C-"); | |
checkParseDoubleForException("\\c\\ca"); | |
checkParseDoubleForException("\\C-\\C-a"); | |
} | |
@Test | |
public static void testParseDoubleInterpolation() { | |
checkParseDoubleForException("#$foo"); | |
checkParseDoubleForException("#@foo"); | |
checkParseDoubleForException("#{foo}"); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment