Skip to content

Instantly share code, notes, and snippets.

@KronicDeth
Created January 9, 2015 02:54
Show Gist options
  • Save KronicDeth/e7d41aee4127fe7dea89 to your computer and use it in GitHub Desktop.
Save KronicDeth/e7d41aee4127fe7dea89 to your computer and use it in GitHub Desktop.
6-hexadecimal digits escape sequences in Elixir over JInterface
public class ElixirPsiImplUtil {
/**
 * Translates a single-character escape (the character following the backslash) into the code point it stands for.
 *
 * The mapping mirrors Elixir's own unescaping, see
 * https://github.com/elixir-lang/elixir/blob/de39bbaca277002797e52ffbde617ace06233a2b/lib/elixir/src/elixir_interpolation.erl#L130-L142
 *
 * @param escapedCharacter the PSI element whose ESCAPED_CHARACTER_TOKEN child holds the escaped character.
 * @return the translated code point; the escaped character's own code point when it has no special meaning; or
 *   -1 if {@code escapedCharacter} does not have exactly one ESCAPED_CHARACTER_TOKEN child.
 */
public static int codePoint(@NotNull ElixirEscapedCharacter escapedCharacter) {
    final ASTNode node = escapedCharacter.getNode();
    final ASTNode[] tokens = node.getChildren(TokenSet.create(ElixirTypes.ESCAPED_CHARACTER_TOKEN));

    // anything other than exactly one token cannot be interpreted
    if (tokens.length != 1) {
        return -1;
    }

    final int escaped = tokens[0].getText().codePointAt(0);

    switch (escaped) {
        case '0':
            return 0;
        case 'a':
            return 7;
        case 'b':
            return 8;
        case 'd':
            return 127;
        case 'e':
            return 27;
        case 'f':
            return 12;
        case 'n':
            return 10;
        case 'r':
            return 13;
        case 's':
            return 32;
        case 't':
            return 9;
        case 'v':
            return 11;
        default:
            // no special meaning: the escape stands for the character itself
            return escaped;
    }
}
/**
 * Parses the hexadecimal digits of an escape sequence as a base-16 integer.
 *
 * @param hexadecimalEscapeSequence the PSI element whose VALID_HEXADECIMAL_DIGITS child holds the digits.
 * @return the parsed value, or -1 if {@code hexadecimalEscapeSequence} does not have exactly one
 *   VALID_HEXADECIMAL_DIGITS child or the child's text cannot be parsed as a base-16 {@code int}.
 */
public static int codePoint(@NotNull EscapedHexadecimalDigits hexadecimalEscapeSequence) {
    ASTNode[] validHexadecimalDigitsArray = hexadecimalEscapeSequence
            .getNode()
            .getChildren(
                    TokenSet.create(ElixirTypes.VALID_HEXADECIMAL_DIGITS)
            );
    int parsedCodePoint = -1;

    if (validHexadecimalDigitsArray.length == 1) {
        ASTNode validHexadecimalDigits = validHexadecimalDigitsArray[0];
        String formattedHexadecimalDigits = validHexadecimalDigits.getText();

        try {
            parsedCodePoint = Integer.parseInt(formattedHexadecimalDigits, 16);
        } catch (NumberFormatException numberFormatException) {
            // Integer.parseInt throws for text it cannot represent as an int (e.g. too many digits); honor the
            // documented "-1 if it cannot be parsed" contract instead of letting the exception escape.
            parsedCodePoint = -1;
        }
    }

    return parsedCodePoint;
}
/**
 * Finds the code point of a hexadecimal escape sequence by delegating to whichever digits element it contains.
 *
 * The enclosed form is tried first and the open form is the fallback (presumably {@code \x{...}} and {@code \xHH}
 * respectively -- confirm against the grammar).
 *
 * @param hexadecimalEscapeSequence the escape sequence to interpret.
 * @return the code point reported by the digits element, or -1 if neither form is present.
 */
public static int codePoint(@NotNull ElixirHexadecimalEscapeSequence hexadecimalEscapeSequence) {
    EscapedHexadecimalDigits digits = hexadecimalEscapeSequence.getEnclosedHexadecimalEscapeSequence();

    if (digits == null) {
        digits = hexadecimalEscapeSequence.getOpenHexadecimalEscapeSequence();
    }

    return digits == null ? -1 : digits.codePoint();
}
/**
 * Quotes a char list heredoc.
 *
 * The child nodes of every heredoc line are queued (with the heredoc prefix length passed along to
 * {@code queueChildNodes}), the queued fragments are merged by {@code mergeCharListFragments}, and the merged nodes
 * are quoted as the body of an interpolated char list.
 *
 * @param charListHeredoc the heredoc to quote.
 * @return the quoted form produced by {@code quotedInterpolatedCharListBodyChildNodes}.
 */
@Contract(pure = true)
@NotNull
public static OtpErlangObject quote(@NotNull final ElixirCharListHeredoc charListHeredoc) {
    final ElixirCharListHeredocPrefix prefix = charListHeredoc.getCharListHeredocPrefix();
    final int prefixLength = prefix.getTextLength();
    final Deque<ASTNode> alignedNodes = new LinkedList<ASTNode>();
    final List<ElixirCharListHeredocLine> lines = charListHeredoc.getCharListHeredocLineList();

    for (ElixirCharListHeredocLine heredocLine : lines) {
        queueChildNodes(heredocLine, prefixLength, alignedNodes);
    }

    final Queue<ASTNode> merged = mergeCharListFragments(alignedNodes, charListHeredoc.getManager());
    final ASTNode[] bodyChildren = merged.toArray(new ASTNode[merged.size()]);

    return quotedInterpolatedCharListBodyChildNodes(charListHeredoc, bodyChildren);
}
/**
 * Quotes the child nodes of an interpolated char list body.
 *
 * <ul>
 *   <li>No children: an empty {@code OtpErlangList}.</li>
 *   <li>Exactly one CHAR_LIST_FRAGMENT child: an {@code OtpErlangString} of its text.</li>
 *   <li>Several children without any INTERPOLATION: an {@code OtpErlangString} of the fragments' text with the
 *     escape sequences replaced by their code points.</li>
 *   <li>Several children with at least one INTERPOLATION: a quoted {@code String.to_char_list} call on a quoted
 *     {@code <<>>} call whose arguments are the literal runs and the quoted interpolations, in source order.</li>
 * </ul>
 *
 * NOTE(review): the literal text is accumulated in a Java {@code String}, so a code point above 0xFFFF is stored as
 * a UTF-16 surrogate pair before it is handed to {@code OtpErlangString}; confirm that representation is what
 * consumers of the quoted form expect.
 *
 * @param anchor element passed to {@code metadata} to build the metadata for the quoted function calls.
 * @param children the body's child nodes, in source order.
 * @return the quoted form described above.
 * @throws NotImplementedException if the only child is not a CHAR_LIST_FRAGMENT, or if one of several children has
 *   an element type other than CHAR_LIST_FRAGMENT, ESCAPED_CHARACTER, HEXADECIMAL_ESCAPE_SEQUENCE, or INTERPOLATION.
 * @throws IllegalArgumentException if an escape sequence does not yield a valid Unicode code point.
 */
protected static OtpErlangObject quotedInterpolatedCharListBodyChildNodes(PsiElement anchor, ASTNode... children) {
    OtpErlangObject quoted;
    final int childCount = children.length;

    if (childCount == 0) {
        // an empty CharList is just an empty list
        quoted = new OtpErlangList();
    } else if (childCount == 1) {
        ASTNode child = children[0];

        if (child.getElementType() == ElixirTypes.CHAR_LIST_FRAGMENT) {
            final String text = child.getText();
            quoted = new OtpErlangString(text);
        } else {
            throw new NotImplementedException("Can't quote ElixirInterpolatedCharListBody with one child that isn't a CHAR_LIST_FRAGMENT");
        }
    } else {
        OtpErlangList interpolatedCharListBodyMetadata = metadata(anchor);
        List<OtpErlangObject> quotedCharListList = new LinkedList<OtpErlangObject>();
        // literal text seen since the last interpolation; null when there is none pending
        StringBuilder stringAccumulator = null;

        for (ASTNode child : children) {
            IElementType elementType = child.getElementType();

            if (elementType == ElixirTypes.CHAR_LIST_FRAGMENT) {
                if (stringAccumulator == null) {
                    stringAccumulator = new StringBuilder();
                }

                stringAccumulator.append(child.getText());
            } else if (elementType == ElixirTypes.ESCAPED_CHARACTER ||
                    elementType == ElixirTypes.HEXADECIMAL_ESCAPE_SEQUENCE) {
                int codePoint;

                if (elementType == ElixirTypes.ESCAPED_CHARACTER) {
                    ElixirEscapedCharacter escapedCharacter = (ElixirEscapedCharacter) child.getPsi();
                    codePoint = escapedCharacter.codePoint();
                } else {
                    ElixirHexadecimalEscapeSequence hexadecimalEscapeSequence = (ElixirHexadecimalEscapeSequence) child.getPsi();
                    codePoint = hexadecimalEscapeSequence.codePoint();
                }

                // codePoint() signals failure with -1 and hexadecimal digits may exceed the Unicode range;
                // StringBuilder.appendCodePoint would reject both with a context-free IllegalArgumentException, so
                // fail with the same exception type but name the offending text.
                if (!Character.isValidCodePoint(codePoint)) {
                    throw new IllegalArgumentException(
                            "Can't quote escape sequence `" + child.getText() + "`: it has no valid code point"
                    );
                }

                if (stringAccumulator == null) {
                    stringAccumulator = new StringBuilder();
                }

                stringAccumulator.appendCodePoint(codePoint);
            } else if (elementType == ElixirTypes.INTERPOLATION) {
                // flush the pending literal text so it stays ahead of the interpolation
                if (stringAccumulator != null) {
                    quotedCharListList.add(elixirString(stringAccumulator.toString()));
                    stringAccumulator = null;
                }

                ElixirInterpolation childElement = (ElixirInterpolation) child.getPsi();
                quotedCharListList.add(childElement.quote());
            } else {
                throw new NotImplementedException("Can quote only CHAR_LIST_FRAGMENT, ESCAPED_CHARACTER, HEXADECIMAL_ESCAPE_SEQUENCE, and INTERPOLATION");
            }
        }

        // can be represented as a pure Erlang string (Elixir CharList)
        if (stringAccumulator != null && quotedCharListList.isEmpty()) {
            quoted = new OtpErlangString(stringAccumulator.toString());
        } else {
            // trailing literal text after the last interpolation
            if (stringAccumulator != null) {
                quotedCharListList.add(elixirString(stringAccumulator.toString()));
            }

            OtpErlangObject[] quotedStringElements = new OtpErlangObject[quotedCharListList.size()];
            quotedCharListList.toArray(quotedStringElements);

            OtpErlangTuple binaryConstruction = quotedFunctionCall("<<>>", interpolatedCharListBodyMetadata, quotedStringElements);
            quoted = quotedFunctionCall(
                    "String",
                    "to_char_list",
                    interpolatedCharListBodyMetadata,
                    binaryConstruction
            );
        }
    }

    return quoted;
}
}
'''
\'''
\#{}
\"
\0
\1
\a
\b
\d
\e
\f
\b
\r
\s
\v
\x12
\x{100000}
'''

Elixir has some of the best Unicode support, using UTF-8 encoding. It supports up to 6-digit hexadecimal escape sequences, such as \x{100000}. As part of the testing for intellij-elixir, I've started to have intellij-elixir produce what it thinks is the quoted form of the Elixir it parses. I can then compare intellij-elixir's quoted form to Elixir's native quoted form by sending a request to intellij_elixir over Erlang's JInterface. JInterface has an OtpErlangString, which represents an Erlang string, or in other words, an Elixir CharList. There is an OtpErlangString constructor that takes a Java String as an argument, so I've been building up the CharList in ElixirPsiImplUtil.java using StringBuilder.appendCodePoint. This is where the problem occurred: I was getting mismatched quoting for the escape sequence \x{100000} in EscapeSequence.ex. From Java I got \uDBC0\uDC00 in quoted.java.txt, but the correct quoting is 1048576 in quoted.ex. 1048576 is just 0x100000 in decimal. So, what's happening in Java? Well, Java uses UTF-16 and will convert any Unicode code point over 0xFFFF to a surrogate pair of two chars, in this case mapping 0x100000 to 0xDBC0 and 0xDC00. The solution is to not pass a Java String to OtpErlangString, but instead to represent the argument as a collection of integers that can be converted to an OtpErlangList, which the OtpErlangString constructor also accepts. Using this more direct approach of representing a CharList as a code point list in Java bypasses the surrogate pair problem.

[39, 39, 39, 10, 35, 123, 125, 10, 34, 10, 0, 10, 49, 10, 7, 10, 8, 10, 127, 10,
27, 10, 12, 10, 8, 10, 13, 10, 32, 10, 11, 10, 18, 10, 1048576, 10]
''' 39
''' 39
''' 39
'\n' 10
'#' 35
'{' 123
'}' 125
'\n' 10
'\"' 34
'\n' 10
'\u0000' 0
'\n' 10
'1' 49
'\n' 10
'\u0007' 7
'\n' 10
'\b' 8
'\n' 10
'\u007F' 127
'\n' 10
'\u001B' 27
'\n' 10
'\f' 12
'\n' 10
'\b' 8
'\n' 10
'\r' 13
'\n' 10
' ' 32
'\n' 10
'\u000B' 11
'\n' 10
'\u0012' 18
'\n' 10
'\uDBC0' 56256
'\uDC00' 56320
'}' 125
'\n' 10
quote do: '''
\'''
\#{}
\"
\0
\1
\a
\b
\d
\e
\f
\b
\r
\s
\v
\x12
\x{100000}
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment