Skip to content

Instantly share code, notes, and snippets.

@cky
Created August 19, 2012 05:32
Show Gist options
  • Save cky/3392271 to your computer and use it in GitHub Desktop.
Save cky/3392271 to your computer and use it in GitHub Desktop.
Ruby string parser (minus interpolation) using Java enums
package nz.kiwi.chris.j7gs.types;
import com.google.common.base.Preconditions;
public class GSString implements Comparable<GSString> {
private final String value;
public GSString(String value) {
this.value = value;
}
@Override
public int hashCode() {
return value.hashCode();
}
@Override
public boolean equals(Object obj) {
return obj instanceof GSString && value.equals(((GSString) obj).value);
}
@Override
public int compareTo(GSString rhs) {
return value.compareTo(rhs.value);
}
@Override
public String toString() {
return value;
}
/**
* Parses a single- or double-quoted string according to Ruby rules,
* except for interpolation, which is not implemented.
*
* @param str the string to parse
* @return the parsed string
* @throws IllegalArgumentException if the string is not a valid
* single- or double-quoted string
*/
public static GSString parseQuoted(String str) {
ParserData data = new ParserData();
ParserState state = ParserState.START;
for (int i = 0; i < str.length(); ++i) {
state = state.handle(data, str.charAt(i));
if (data.pushback) {
--i;
data.pushback = false;
}
}
Preconditions.checkArgument(state == ParserState.END,
"String ran out inside escape");
return new GSString(data.toString());
}
private static class ParserData {
private final StringBuilder sb = new StringBuilder();
private int scratch;
private int count;
private boolean ctrl;
private boolean pushback;
public ParserData append(char c) {
sb.append(ctrl ? (char) (c & 0x9f) : c);
ctrl = false;
return this;
}
public ParserData appendCodePoint(int codePoint) {
sb.appendCodePoint(ctrl ? codePoint & 0x9f : codePoint);
ctrl = false;
return this;
}
@Override
public String toString() {
return sb.toString();
}
}
private enum ParserState {
START {
@Override
ParserState handle(ParserData data, char c) {
switch (c) {
case '\'':
return SINGLE;
case '"':
return DOUBLE;
default:
throw new IllegalArgumentException("String did not start with quote");
}
}
},
END {
@Override
ParserState handle(ParserData data, char c) {
throw new IllegalArgumentException("Unescaped quote character");
}
},
/*
* Single-quoted string states.
*/
SINGLE {
@Override
ParserState handle(ParserData data, char c) {
switch (c) {
case '\'':
return END;
case '\\':
return SINGLE_ESC;
default:
data.append(c);
return this;
}
}
},
SINGLE_ESC {
@Override
ParserState handle(ParserData data, char c) {
switch (c) {
default:
data.append('\\');
// $FALL-THROUGH$
case '\'':
case '\\':
data.append(c);
return SINGLE;
}
}
},
/*
* Double-quoted string states.
*/
DOUBLE {
@Override
ParserState handle(ParserData data, char c) {
switch (c) {
case '"':
return END;
case '\\':
return DOUBLE_ESC;
case '#':
return HASH;
default:
data.append(c);
return this;
}
}
},
DOUBLE_ESC {
@Override
ParserState handle(ParserData data, char c) {
if (simpleEscape(data, c))
return DOUBLE;
switch (c) {
case '0': case '1': case '2': case '3':
case '4': case '5': case '6': case '7':
data.pushback = true;
data.scratch = 0;
data.count = 0;
return OCTAL;
case 'x':
data.scratch = 0;
data.count = 0;
return HEX;
case 'u':
data.scratch = 0;
data.count = 0;
return UNICODE_INIT;
case '\r':
return CR;
case '\n':
return DOUBLE;
case 'C':
return BIG_C;
case 'c':
return CONTROL;
default:
data.append(c);
return DOUBLE;
}
}
},
OCTAL {
@Override
ParserState handle(ParserData data, char c) {
return parseNumber(data, c, 3, 1, 3, DOUBLE);
}
},
HEX {
@Override
ParserState handle(ParserData data, char c) {
return parseNumber(data, c, 4, 1, 2, DOUBLE);
}
},
UNICODE_INIT {
@Override
ParserState handle(ParserData data, char c) {
if (c == '{')
return UNICODE_LCURLY;
data.pushback = true;
return UNICODE4;
}
},
UNICODE4 {
@Override
ParserState handle(ParserData data, char c) {
return parseNumber(data, c, 4, 4, 4, DOUBLE);
}
},
UNICODE_LCURLY {
@Override
ParserState handle(ParserData data, char c) {
return parseNumber(data, c, 4, 1, 6, UNICODE_RCURLY);
}
},
UNICODE_RCURLY {
@Override
ParserState handle(ParserData data, char c) {
Preconditions.checkArgument(c == '}', "Invalid Unicode escape");
return DOUBLE;
}
},
CR {
@Override
ParserState handle(ParserData data, char c) {
if (c != '\n') {
data.pushback = true;
data.append('\r');
}
return DOUBLE;
}
},
BIG_C {
@Override
ParserState handle(ParserData data, char c) {
Preconditions.checkArgument(c == '-', "Invalid control escape");
return CONTROL;
}
},
CONTROL {
@Override
ParserState handle(ParserData data, char c) {
Preconditions.checkArgument(!data.ctrl, "Invalid control escape");
switch (c) {
case '\\':
data.ctrl = true;
return DOUBLE_ESC;
case '?':
data.append('\177');
return DOUBLE;
default:
data.ctrl = true;
data.append(c);
return DOUBLE;
}
}
},
HASH {
@Override
ParserState handle(ParserData data, char c) {
switch (c) {
case '$': case '@': case '{':
throw new IllegalArgumentException("String interpolation is not implemented");
default:
data.append('#');
data.pushback = true;
return DOUBLE;
}
}
};
abstract ParserState handle(ParserData data, char c);
static boolean simpleEscape(ParserData data, char c) {
switch (c) {
case '\\': break;
case 'n': c = '\n'; break;
case 't': c = '\t'; break;
case 'r': c = '\r'; break;
case 'f': c = '\f'; break;
case 'v': c = '\13'; break;
case 'a': c = '\7'; break;
case 'e': c = '\33'; break;
case 'b': c = '\b'; break;
case 's': c = '\40'; break;
default: return false;
}
data.append(c);
return true;
}
private static int hexValue(char c) {
switch (c) {
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
return c - '0';
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
return c - ('A' - 10);
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
return c - ('a' - 10);
default:
return -1;
}
}
ParserState parseNumber(ParserData data, char c, int shift, int minCount,
int maxCount, ParserState nextState) {
int value = hexValue(c);
if (value >= 0 && value < (1 << shift)) {
data.scratch = (data.scratch << shift) + value;
if (++data.count < maxCount)
return this;
} else {
Preconditions.checkArgument(data.count >= minCount,
"Invalid hex/Unicode escape");
data.pushback = true;
}
data.appendCodePoint(data.scratch);
return nextState;
}
}
}
package nz.kiwi.chris.j7gs.types;
import org.testng.annotations.Test;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.fail;
/** TestNG tests for {@link GSString#parseQuoted(String)}. */
public class GSStringTest {
    /** Wraps {@code body} in single quotes, parses it, and compares against {@code parsed}. */
    private static void expectSingle(String body, String parsed) {
        GSString actual = GSString.parseQuoted("'" + body + "'");
        assertEquals(actual, new GSString(parsed));
    }

    /** Wraps {@code body} in double quotes, parses it, and compares against {@code parsed}. */
    private static void expectDouble(String body, String parsed) {
        GSString actual = GSString.parseQuoted("\"" + body + "\"");
        assertEquals(actual, new GSString(parsed));
    }

    /** Wraps {@code body} in double quotes and requires parsing to be rejected. */
    private static void expectDoubleRejected(String body) {
        boolean rejected = false;
        try {
            GSString.parseQuoted("\"" + body + "\"");
        } catch (IllegalArgumentException expected) {
            rejected = true;
        }
        if (!rejected) {
            fail();
        }
    }

    @Test
    public static void testParseSingle() {
        expectSingle("foo bar", "foo bar");
        expectSingle("foo\\bar", "foo\\bar");
        expectSingle("foo\\\\bar", "foo\\bar");
        expectSingle("foo\\'bar", "foo'bar");
        expectSingle("foo\\\"bar", "foo\\\"bar");
        expectSingle("foo\"bar", "foo\"bar");
    }

    @Test(expectedExceptions = IllegalArgumentException.class)
    public static void testParseSingleWithoutLeadingQuote() {
        GSString.parseQuoted("foo bar'");
    }

    @Test(expectedExceptions = IllegalArgumentException.class)
    public static void testParseSingleWithoutTrailingQuote() {
        GSString.parseQuoted("'foo bar");
    }

    @Test(expectedExceptions = IllegalArgumentException.class)
    public static void testParseSingleWithRunawayEscape() {
        GSString.parseQuoted("'foo bar\\");
    }

    @Test(expectedExceptions = IllegalArgumentException.class)
    public static void testParseSingleWithStrayQuote() {
        GSString.parseQuoted("'foo'bar'");
    }

    @Test
    public static void testParseDouble() {
        // Plain content and the basic backslash forms
        expectDouble("foo bar", "foo bar");
        expectDouble("foo\\bar", "foo\bar");
        expectDouble("foo\\\\bar", "foo\\bar");
        expectDouble("foo\\\"bar", "foo\"bar");
        expectDouble("foo\\'bar", "foo'bar");
        expectDouble("foo'bar", "foo'bar");

        // Interpolation markers that have been escaped
        expectDouble("foo#bar", "foo#bar");
        expectDouble("foo#\\$bar", "foo#$bar");
        expectDouble("foo\\#$bar", "foo#$bar");
        expectDouble("foo#\\@bar", "foo#@bar");
        expectDouble("foo\\#@bar", "foo#@bar");
        expectDouble("foo#\\{bar}", "foo#{bar}");
        expectDouble("foo\\#{bar}", "foo#{bar}");

        // Single-letter escapes, each next to its escaped-backslash look-alike
        expectDouble("\\a\\\\a", "\7\\a");
        expectDouble("\\b\\\\b", "\10\\b");
        expectDouble("\\t\\\\t", "\11\\t");
        expectDouble("\\n\\\\n", "\12\\n");
        expectDouble("\\v\\\\v", "\13\\v");
        expectDouble("\\f\\\\f", "\14\\f");
        expectDouble("\\r\\\\r", "\15\\r");
        expectDouble("\\e\\\\e", "\33\\e");
        expectDouble("\\s\\\\s", "\40\\s");

        // Octal
        expectDouble("\\0", "\0");
        expectDouble("\\000", "\0");
        expectDouble("\\0000", "\0" + "0");
        expectDouble("\\377", "\377");
        expectDouble("\\378", "\37" + "8");
        expectDouble("\\387", "\3" + "87");
        expectDouble("\\777", "\u01ff");

        // Hexadecimal
        expectDouble("\\x0", "\0");
        expectDouble("\\x00", "\0");
        expectDouble("\\x000", "\0" + "0");
        expectDouble("\\xff", "\377");
        expectDouble("\\xfg", "\17g");

        // Unicode
        expectDouble("\\u0000", "\0");
        expectDouble("\\u00000", "\0" + "0");
        expectDouble("\\u{0}", "\0");
        expectDouble("\\u{000}0", "\0" + "0");
        expectDouble("\\u{0000}", "\0");
        expectDouble("\\u{000000}", "\0");
        expectDouble("\\u28cd2", "\u28cd" + "2");
        expectDouble("\\u{28cd2}", String.format("%c", 0x28cd2));
        expectDouble("\\u10ffff", "\u10ff" + "ff");
        expectDouble("\\u{10ff}ff", "\u10ff" + "ff");
        expectDouble("\\u{10fff}f", String.format("%c%c", 0x10fff, 'f'));
        expectDouble("\\u{10ffff}", String.format("%c", 0x10ffff));

        // Backslash followed by a line break
        expectDouble("\\\n", "");
        expectDouble("\\\r\n", "");
        expectDouble("\\\r", "\r");

        // Control escapes: the straightforward forms
        expectDouble("\\ca", "\1");
        expectDouble("\\cA", "\1");
        expectDouble("\\C-a", "\1");
        expectDouble("\\C-A", "\1");
        expectDouble("\\c?", "\177");
        expectDouble("\\C-?", "\177");

        // Control escapes applied to another escape
        expectDouble("\\c\\0", "\0");
        expectDouble("\\c\\n", "\n");
        expectDouble("\\c\\s", "\0");
        expectDouble("\\c\\x3f", "\37"); // not \177
        expectDouble("\\c\\77", "\37"); // not \177
        expectDouble("\\c\\xa0", "\200");

        // Control escapes: unusual targets
        expectDouble("\\c-a", "\15a");
        expectDouble("\\c\"", "\2");
        expectDouble("\\c\\\\", "\34");
        expectDouble("\\c#$foo", "\3$foo");
        expectDouble("\\c#@foo", "\3@foo");
        expectDouble("\\c#{foo}", "\3{foo}");
    }

    @Test(expectedExceptions = IllegalArgumentException.class)
    public static void testParseDoubleWithoutLeadingQuote() {
        GSString.parseQuoted("foo bar\"");
    }

    @Test(expectedExceptions = IllegalArgumentException.class)
    public static void testParseDoubleWithoutTrailingQuote() {
        GSString.parseQuoted("\"foo bar");
    }

    @Test(expectedExceptions = IllegalArgumentException.class)
    public static void testParseDoubleWithRunawayEscape() {
        GSString.parseQuoted("\"foo bar\\");
    }

    @Test(expectedExceptions = IllegalArgumentException.class)
    public static void testParseDoubleWithRunawayControl() {
        GSString.parseQuoted("\"foo bar\\c");
    }

    @Test
    public static void testParseDoubleWithStrayQuote() {
        expectDoubleRejected("foo\"bar");
    }

    @Test
    public static void testParseDoubleInvalidHex() {
        expectDoubleRejected("\\x");
        expectDoubleRejected("\\xg");
    }

    @Test
    public static void testParseDoubleInvalidUnicode() {
        expectDoubleRejected("\\u");
        expectDoubleRejected("\\u123");
        expectDoubleRejected("\\u123g");
        expectDoubleRejected("\\u{");
        expectDoubleRejected("\\u{}");
        expectDoubleRejected("\\u{g}");
        expectDoubleRejected("\\u{abcdg}");
        expectDoubleRejected("\\u{abcdef}");
        expectDoubleRejected("\\u{110000}");
        expectDoubleRejected("\\u{0000000}");
    }

    @Test
    public static void testParseDoubleInvalidControl() {
        expectDoubleRejected("\\c");
        expectDoubleRejected("\\C");
        expectDoubleRejected("\\C?");
        expectDoubleRejected("\\C-");
        expectDoubleRejected("\\c\\ca");
        expectDoubleRejected("\\C-\\C-a");
    }

    @Test
    public static void testParseDoubleInterpolation() {
        expectDoubleRejected("#$foo");
        expectDoubleRejected("#@foo");
        expectDoubleRejected("#{foo}");
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment