Created
February 15, 2012 22:34
-
-
Save palianytsia/1839539 to your computer and use it in GitHub Desktop.
This class contains examples that help you understand the char type together with the Unicode encoding scheme.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* <p> | |
* This class contains examples that help to understand the <code>char</code> | |
* type together with the <strong>Unicode encoding scheme</strong>. The explanations | |
* and examples are based on "The char Type" and "Code Points and Code Units" chapters | |
* of "Core Java" book (Volume 1, 8th edition) by Cay Horstmann and Gary Cornell. | |
* </p> | |
* <p> | |
* Before proceeding to examples, read the following terminology: | |
* </p> | |
* <p> | |
* A <em>code point</em> is a code value that is associated with a character in | |
* an encoding scheme. In the Unicode standard, code points are written in | |
* hexadecimal and prefixed with U+, such as U+0041 for the code point of the | |
* letter A. Unicode has code points that are grouped into 17 | |
* <em>code planes</em>. The first code plane, called the | |
* <em>basic multilingual plane</em>, consists of the “classic” Unicode | |
* characters with code points U+0000 to U+FFFF. Sixteen additional planes, with | |
* code points U+10000 to U+10FFFF, hold the <em>supplementary characters</em>. | |
* </p> | |
* <p> | |
* The UTF-16 encoding is a method of representing all Unicode code points in a | |
* variable-length code. The characters in the basic multilingual plane are | |
* represented as 16-bit values, called <em>code units</em>. The supplementary | |
* characters are encoded as consecutive pairs of code units. Each of the values | |
* in such an encoding pair falls into a range of 2048 unused values of the | |
* basic multilingual plane, called the <em>surrogates area</em> (U+D800 to | |
* U+DBFF for the first code unit, U+DC00 to U+DFFF for the second code unit). | |
* This is rather clever, because you can immediately tell whether a code unit | |
* encodes a single character or whether it is the first or second part of a | |
* supplementary character. | |
* </p> | |
* | |
* @author Ivan Palianytsia | |
*/ | |
public class CharByExample | |
{ | |
public static void main(String[] args) | |
{ | |
example1(); | |
example2(); | |
example3(); | |
example4(); | |
} | |
/** | |
* Example 1. Unicode code units can be expressed as hexadecimal values that | |
* run from \u0000 to \uFFFF. For example, \u2122 is the trademark symbol | |
* (TM) and \u03C0 is the Greek letter pi (π). | |
*/ | |
private static void example1() | |
{ | |
System.out.println("***** Example 1 *****"); | |
System.out.println("\\u2122 is the trademark symbol \u2122"); | |
System.out.println("And \\u03C0 is is the Greek letter pi (\u03C0)"); | |
System.out.println(); | |
} | |
/** | |
* Example 2. The mathematical symbol for the set of integers Zet has code | |
* point U+1D56B and is encoded by the two code units U+D835 and U+DD6B. | |
*/ | |
private static void example2() | |
{ | |
System.out.println("***** Example 2 *****"); | |
String z = "\uD835\uDD6B"; | |
System.out.println(z + " has code point U+" | |
+ Integer.toHexString(z.codePointAt(0)).toUpperCase()); | |
System.out.println("And is encoded by the two code units U+D835 and U+DD6B"); | |
System.out.println(); | |
} | |
/** | |
* Example 3. The length method yields the number of code units required for | |
* a given string in the UTF-16 encoding. To get the true length, that is, | |
* the number of code points use <code>codePointCount</code> method of the | |
* <code>String</code> class. | |
*/ | |
private static void example3() | |
{ | |
System.out.println("***** Example 3 *****"); | |
String string = "\uD835\uDD6B is the set of integers"; | |
System.out.println("String \"" + string + "\" has length " + string.length()); | |
System.out.println("However actual number of code points (symbols) in the string is " | |
+ string.codePointCount(0, string.length())); | |
System.out.println(); | |
} | |
/** | |
* The call <code>s.charAt(n)</code> returns the code unit at position n, | |
* where n is between 0 and <code>s.length()</code> – 1. To get at the ith | |
* code point, use the statements | |
* | |
* <pre> | |
* int index = greeting.offsetByCodePoints(0, i); | |
* int cp = greeting.codePointAt(index); | |
* </pre> | |
*/ | |
private static void example4() | |
{ | |
System.out.println("***** Example 4 *****"); | |
String stringA = "Hello"; | |
String stringB = "\uD835\uDD6B is the set of integers"; | |
System.out.println("Call to charAt(1) on the String \"" + stringA | |
+ "\" returns the second character - " + stringA.charAt(1)); | |
System.out.println("However the same call on the String \"" + stringB | |
+ "\" doesn't return a space but the second code unit of " + "\uD835\uDD6B" + " - " | |
+ stringB.charAt(1)); | |
System.out.println("To avoid this problem, you should not use the char " | |
+ "type. It is too low-level."); | |
System.out.println("Right way to traverse a string is to look at each code point in turn:"); | |
int i = 0; | |
while (i < stringB.length()) | |
{ | |
int cp = stringB.codePointAt(i); | |
printCodePoint(cp); | |
if (Character.isSupplementaryCodePoint(cp)) | |
{ | |
i += 2; | |
} | |
else | |
{ | |
i++; | |
} | |
} | |
System.out.println(); | |
System.out.println("Unfortunately, the codePointAt method can't tell whether a code unit" | |
+ " is the first or second half of a supplementary character.\nIt returns the right" | |
+ " result only on the first half of a supplementary character:"); | |
for(int j = 0; j < 2; j++) { | |
int cp = "\uD835\uDD6B".codePointAt(j); | |
printCodePoint (cp); | |
System.out.println(" (" + cp + ")"); | |
} | |
} | |
/** | |
* Converts integer code of the code point to character and outputs it to the screen in square | |
* brackets. | |
*/ | |
private static void printCodePoint(int cp) | |
{ | |
System.out.print("["); | |
System.out.print(Character.toChars(cp)); | |
System.out.print("]"); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment