Elixir has some of the best Unicode support, using UTF-8 encoding. It supports hexadecimal escape sequences of up to 6 digits, such as \x{100000}. As part of the testing for intellij-elixir, I've started to have intellij-elixir produce what it thinks is the quoted form of the Elixir it parses. I can then compare intellij-elixir's quoted form to Elixir's native quoted form by sending a request to intellij_elixir over Erlang's JInterface. JInterface has an OtpErlangString class, which represents an Erlang string, or in other words, an Elixir CharList. There is an OtpErlangString constructor that takes a Java String as an argument, so I've been building up the CharList in ElixirPsiImplUtil.java using StringBuilder.appendCodePoint. This is where the problem occurred: I was getting mismatched quoting for the escape sequence \x{100000} in EscapeSequence.ex. From Java I got \uDBC0\uDC00 in quoted.java.txt, but the correct quoting is 1048576 in quoted.ex. 1048576 is just 0x100000 in decimal. So, what's happening in Java? Well, Java uses UTF-16 and will convert any Unicode code point over 0xFFFF to a surrogate pair of two chars, in this case mapping 0x100000 to 0xDBC0 and 0xDC00. The solution is to not represent the argument to OtpErlangString as a Java String, but instead as a collection of integers that can be converted to an OtpErlangList, which the OtpErlangString constructor also accepts. Using this more direct approach of representing a CharList as a code point list in Java bypasses the surrogate pair problem.
Created
January 9, 2015 02:54
-
-
Save KronicDeth/e7d41aee4127fe7dea89 to your computer and use it in GitHub Desktop.
6-hexadecimal digits escape sequences in Elixir over JInterface
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| public class ElixirPsiImplUtil { | |
| // @return -1 if codePoint cannot be parsed. | |
| public static int codePoint(@NotNull ElixirEscapedCharacter escapedCharacter) { | |
| ASTNode[] escapedCharacterTokens = escapedCharacter | |
| .getNode() | |
| .getChildren(TokenSet.create(ElixirTypes.ESCAPED_CHARACTER_TOKEN)); | |
| int parsedCodePoint = -1; | |
| if (escapedCharacterTokens.length == 1) { | |
| ASTNode escapedCharacterToken = escapedCharacterTokens[0]; | |
| String formattedEscapedCharacter = escapedCharacterToken.getText(); | |
| int formattedCodePoint = formattedEscapedCharacter.codePointAt(0); | |
| // see https://github.com/elixir-lang/elixir/blob/de39bbaca277002797e52ffbde617ace06233a2b/lib/elixir/src/elixir_interpolation.erl#L130-L142 | |
| switch (formattedCodePoint) { | |
| case '0': | |
| parsedCodePoint = 0; | |
| break; | |
| case 'a': | |
| parsedCodePoint = 7; | |
| break; | |
| case 'b': | |
| parsedCodePoint = 8; | |
| break; | |
| case 'd': | |
| parsedCodePoint = 127; | |
| break; | |
| case 'e': | |
| parsedCodePoint = 27; | |
| break; | |
| case 'f': | |
| parsedCodePoint = 12; | |
| break; | |
| case 'n': | |
| parsedCodePoint = 10; | |
| break; | |
| case 'r': | |
| parsedCodePoint = 13; | |
| break; | |
| case 's': | |
| parsedCodePoint = 32; | |
| break; | |
| case 't': | |
| parsedCodePoint = 9; | |
| break; | |
| case 'v': | |
| parsedCodePoint = 11; | |
| break; | |
| default: | |
| parsedCodePoint = formattedCodePoint; | |
| } | |
| } | |
| return parsedCodePoint; | |
| } | |
| // @return -1 if codePoint cannot be parsed. | |
| public static int codePoint(@NotNull EscapedHexadecimalDigits hexadecimalEscapeSequence) { | |
| ASTNode[] validHexadecimalDigitsArray = hexadecimalEscapeSequence | |
| .getNode() | |
| .getChildren( | |
| TokenSet.create(ElixirTypes.VALID_HEXADECIMAL_DIGITS) | |
| ); | |
| int parsedCodePoint = -1; | |
| if (validHexadecimalDigitsArray.length == 1) { | |
| ASTNode validHexadecimalDigits = validHexadecimalDigitsArray[0]; | |
| String formattedHexadecimalDigits = validHexadecimalDigits.getText(); | |
| parsedCodePoint = Integer.parseInt(formattedHexadecimalDigits, 16); | |
| } | |
| return parsedCodePoint; | |
| } | |
| // @return -1 if codePoint cannot be parsed. | |
| public static int codePoint(@NotNull ElixirHexadecimalEscapeSequence hexadecimalEscapeSequence) { | |
| EscapedHexadecimalDigits escapedHexadecimalDigits = hexadecimalEscapeSequence.getEnclosedHexadecimalEscapeSequence(); | |
| int parsedCodePoint = -1; | |
| if (escapedHexadecimalDigits == null) { | |
| escapedHexadecimalDigits = hexadecimalEscapeSequence.getOpenHexadecimalEscapeSequence(); | |
| } | |
| if (escapedHexadecimalDigits != null) { | |
| parsedCodePoint = escapedHexadecimalDigits.codePoint(); | |
| } | |
| return parsedCodePoint; | |
| } | |
| @Contract(pure = true) | |
| @NotNull | |
| public static OtpErlangObject quote(@NotNull final ElixirCharListHeredoc charListHeredoc) { | |
| ElixirCharListHeredocPrefix charListHeredocPrefix = charListHeredoc.getCharListHeredocPrefix(); | |
| int prefixLength = charListHeredocPrefix.getTextLength(); | |
| Deque<ASTNode> alignedNodeDeque = new LinkedList<ASTNode>(); | |
| List<ElixirCharListHeredocLine> charListHeredocLineList = charListHeredoc.getCharListHeredocLineList(); | |
| for (ElixirCharListHeredocLine line : charListHeredocLineList) { | |
| queueChildNodes(line, prefixLength, alignedNodeDeque); | |
| } | |
| Queue<ASTNode> mergedNodeQueue = mergeCharListFragments(alignedNodeDeque, charListHeredoc.getManager()); | |
| ASTNode[] mergedNodes = new ASTNode[mergedNodeQueue.size()]; | |
| mergedNodeQueue.toArray(mergedNodes); | |
| return quotedInterpolatedCharListBodyChildNodes(charListHeredoc, mergedNodes); | |
| } | |
| protected static OtpErlangObject quotedInterpolatedCharListBodyChildNodes(PsiElement anchor, ASTNode... children) { | |
| OtpErlangObject quoted; | |
| final int childCount = children.length; | |
| if (childCount == 0) { | |
| // an empty CharList is just an empty list | |
| quoted = new OtpErlangList(); | |
| } else if (childCount == 1) { | |
| ASTNode child = children[0]; | |
| if (child.getElementType() == ElixirTypes.CHAR_LIST_FRAGMENT) { | |
| final String text = child.getText(); | |
| quoted = new OtpErlangString(text); | |
| } else { | |
| throw new NotImplementedException("Can't quote ElixirInterpolatedCharListBody with one child that isn't a CHAR_LIST_FRAGMENT"); | |
| } | |
| } else { | |
| OtpErlangList interpolatedCharListBodyMetadata = metadata(anchor); | |
| List<OtpErlangObject> quotedCharListList = new LinkedList<OtpErlangObject>(); | |
| StringBuilder stringAccumulator = null; | |
| for (ASTNode child : children) { | |
| IElementType elementType = child.getElementType(); | |
| if (elementType == ElixirTypes.CHAR_LIST_FRAGMENT) { | |
| if (stringAccumulator == null) { | |
| stringAccumulator = new StringBuilder(""); | |
| } | |
| stringAccumulator.append(child.getText()); | |
| } else if (elementType == ElixirTypes.ESCAPED_CHARACTER) { | |
| if (stringAccumulator == null) { | |
| stringAccumulator = new StringBuilder(""); | |
| } | |
| ElixirEscapedCharacter escapedCharacter = (ElixirEscapedCharacter) child.getPsi(); | |
| stringAccumulator.appendCodePoint( | |
| escapedCharacter.codePoint() | |
| ); | |
| } else if (elementType == ElixirTypes.HEXADECIMAL_ESCAPE_SEQUENCE) { | |
| if (stringAccumulator == null) { | |
| stringAccumulator = new StringBuilder(""); | |
| } | |
| ElixirHexadecimalEscapeSequence hexadecimalEscapeSequence = (ElixirHexadecimalEscapeSequence) child.getPsi(); | |
| stringAccumulator.appendCodePoint( | |
| hexadecimalEscapeSequence.codePoint() | |
| ); | |
| } else if (elementType == ElixirTypes.INTERPOLATION) { | |
| if (stringAccumulator != null) { | |
| quotedCharListList.add(elixirString(stringAccumulator.toString())); | |
| stringAccumulator = null; | |
| } | |
| ElixirInterpolation childElement = (ElixirInterpolation) child.getPsi(); | |
| quotedCharListList.add(childElement.quote()); | |
| } else { | |
| throw new NotImplementedException("Can quote only CHAR_LIST_FRAGMENT and INTERPOLATION"); | |
| } | |
| } | |
| // can be represented as a pure Erlang string (Elixir CharList) | |
| if (stringAccumulator != null && quotedCharListList.isEmpty()) { | |
| quoted = new OtpErlangString(stringAccumulator.toString()); | |
| } else { | |
| if (stringAccumulator != null) { | |
| quotedCharListList.add(elixirString(stringAccumulator.toString())); | |
| } | |
| OtpErlangObject[] quotedStringElements = new OtpErlangObject[quotedCharListList.size()]; | |
| quotedCharListList.toArray(quotedStringElements); | |
| OtpErlangTuple binaryConstruction = quotedFunctionCall("<<>>", interpolatedCharListBodyMetadata, quotedStringElements); | |
| quoted = quotedFunctionCall( | |
| "String", | |
| "to_char_list", | |
| interpolatedCharListBodyMetadata, | |
| binaryConstruction | |
| ); | |
| } | |
| } | |
| return quoted; | |
| } | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ''' | |
| \''' | |
| \#{} | |
| \" | |
| \0 | |
| \1 | |
| \a | |
| \b | |
| \d | |
| \e | |
| \f | |
| \b | |
| \r | |
| \s | |
| \v | |
| \x12 | |
| \x{100000} | |
| ''' |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| [39, 39, 39, 10, 35, 123, 125, 10, 34, 10, 0, 10, 49, 10, 7, 10, 8, 10, 127, 10, | |
| 27, 10, 12, 10, 8, 10, 13, 10, 32, 10, 11, 10, 18, 10, 1048576, 10] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ''' 39 | |
| ''' 39 | |
| ''' 39 | |
| '\n' 10 | |
| '#' 35 | |
| '{' 123 | |
| '}' 125 | |
| '\n' 10 | |
| '\"' 34 | |
| '\n' 10 | |
| '\u0000' 0 | |
| '\n' 10 | |
| '1' 49 | |
| '\n' 10 | |
| '\u0007' 7 | |
| '\n' 10 | |
| '\b' 8 | |
| '\n' 10 | |
| '\u007F' 127 | |
| '\n' 10 | |
| '\u001B' 27 | |
| '\n' 10 | |
| '\f' 12 | |
| '\n' 10 | |
| '\b' 8 | |
| '\n' 10 | |
| '\r' 13 | |
| '\n' 10 | |
| ' ' 32 | |
| '\n' 10 | |
| '\u000B' 11 | |
| '\n' 10 | |
| '\u0012' 18 | |
| '\n' 10 | |
| '\uDBC0' 56256 | |
| '\uDC00' 56320 | |
| '}' 125 | |
| '\n' 10 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| quote do: ''' | |
| \''' | |
| \#{} | |
| \" | |
| \0 | |
| \1 | |
| \a | |
| \b | |
| \d | |
| \e | |
| \f | |
| \b | |
| \r | |
| \s | |
| \v | |
| \x12 | |
| \x{100000} | |
| ''' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment