Last active
January 1, 2016 14:19
-
-
Save TakahikoKawasaki/8156948 to your computer and use it in GitHub Desktop.
Count up letters, bytes in UTF-8 and surrogate pairs in JavaScript
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html> | |
<html> | |
<!-- | |
* Copyright (C) 2013 Neo Visionaries Inc. | |
* | |
* Licensed under the Apache License, Version 2.0 (the "License"); | |
* you may not use this file except in compliance with the License. | |
* You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, software | |
* distributed under the License is distributed on an "AS IS" BASIS, | |
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
* See the License for the specific language governing permissions and | |
* limitations under the License. | |
--> | |
<head> | |
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8"/>
<meta name="author" content="Takahiko Kawasaki"> | |
<title>Count up letters, bytes in UTF-8 and surrogate pairs in JavaScript</title> | |
<script type="text/javascript"> | |
/**
 * Compute how many bytes a value contributes when encoded in UTF-8.
 *
 * The argument may be a UTF-16 code unit (as returned by
 * String.prototype.charCodeAt) or a full Unicode code point (as returned
 * by String.prototype.codePointAt).
 *
 * @param {number} codePoint - UTF-16 code unit or Unicode code point.
 * @returns {number} 1, 2, 3 or 4 for a valid code point; 2 for a single
 *     surrogate code unit (half of the 4 bytes of the pair it belongs to);
 *     0 when the value is not a valid code point (negative, NaN, or
 *     greater than U+10FFFF).
 */
function compute_bytes_in_utf8(codePoint)
{
    // Reject NaN and negative values. Written as a negated ">=" so that
    // NaN, for which every comparison is false, is rejected as well.
    if (!(codePoint >= 0))
    {
        return 0;
    }

    if (codePoint <= 0x007F)
    {
        // U+0000 - U+007F: 1 byte in UTF-8.
        return 1;
    }

    if (codePoint <= 0x07FF)
    {
        // U+0080 - U+07FF: 2 bytes in UTF-8.
        return 2;
    }

    if (codePoint <= 0xD7FF)
    {
        // U+0800 - U+D7FF: 3 bytes in UTF-8.
        return 3;
    }

    if (codePoint <= 0xDFFF)
    {
        // 0xD800 - 0xDBFF: High surrogates.
        // 0xDC00 - 0xDFFF: Low surrogates.
        //
        // A surrogate pair represents one code point in the range
        // U+10000 - U+10FFFF, which consumes 4 bytes in UTF-8. Each half
        // of the pair is therefore reported as 2 (= 4 / 2) so that
        // summing the results for both code units of a pair yields 4.
        return 2;
    }

    if (codePoint <= 0xFFFF)
    {
        // U+E000 - U+FFFF: 3 bytes in UTF-8.
        return 3;
    }

    if (codePoint <= 0x10FFFF)
    {
        // U+10000 - U+10FFFF: 4 bytes in UTF-8. charCodeAt() never
        // produces such a value, but codePointAt() does.
        return 4;
    }

    // Beyond U+10FFFF: not a Unicode code point.
    return 0;
}
/**
 * Count the letters, UTF-8 bytes and surrogate pairs in the text of the
 * "input" element and write the three totals into the "outputLetters",
 * "outputBytes" and "outputPairs" elements.
 *
 * A well-formed surrogate pair (a high surrogate immediately followed by
 * a low surrogate) counts as one letter, one pair and 4 bytes. An
 * unpaired surrogate counts as one letter, no pair and 3 bytes, because
 * a UTF-8 encoder emits U+FFFD (3 bytes) in its place.
 */
function count_up()
{
    // HTML elements for input and output.
    var input = document.getElementById("input").value;
    var outputLetters = document.getElementById("outputLetters");
    var outputBytes = document.getElementById("outputBytes");
    var outputPairs = document.getElementById("outputPairs");

    // Counters for letters, bytes in UTF-8 and surrogate pairs.
    var nLetters = 0;
    var nBytes = 0;
    var nPairs = 0;

    // For each UTF-16 code unit in the input string.
    for (var i = 0; i < input.length; ++i)
    {
        // Note that charCodeAt() always returns a value that is less
        // than 65,536. Higher code points (= U+10000 and higher) are
        // represented by surrogate pairs.
        var codeUnit = input.charCodeAt(i);
        var isHighSurrogate = 0xD800 <= codeUnit && codeUnit <= 0xDBFF;
        var isLowSurrogate = 0xDC00 <= codeUnit && codeUnit <= 0xDFFF;

        // Every iteration accounts for exactly one letter: a BMP
        // character, a whole surrogate pair, or an unpaired surrogate.
        ++nLetters;

        if (isHighSurrogate && i + 1 < input.length)
        {
            var nextUnit = input.charCodeAt(i + 1);

            if (0xDC00 <= nextUnit && nextUnit <= 0xDFFF)
            {
                // Well-formed pair: one code point in the range
                // U+10000 - U+10FFFF, which takes 4 bytes in UTF-8.
                ++nPairs;
                nBytes += 4;

                // Skip the low surrogate that was just consumed.
                ++i;
                continue;
            }
        }

        if (isHighSurrogate || isLowSurrogate)
        {
            // Unpaired surrogate: it cannot be encoded in UTF-8, so an
            // encoder substitutes U+FFFD, which takes 3 bytes.
            nBytes += 3;
        }
        else
        {
            // Ordinary BMP character.
            nBytes += compute_bytes_in_utf8(codeUnit);
        }
    }

    // Write results. textContent is used because the values are plain
    // text, not markup.
    outputLetters.textContent = String(nLetters);
    outputBytes.textContent = String(nBytes);
    outputPairs.textContent = String(nPairs);
}
</script>
</head>
<body> | |
<!-- Input --> | |
<input id="input" type="text" onInput="count_up()"><br/> | |
<!-- Output: Number of letters--> | |
<span id="outputLetters">0</span> letter(s).<br/> | |
<!-- Output: Number of bytes in UTF-8 --> | |
<span id="outputBytes">0</span> byte(s) in UTF-8.<br/> | |
<!-- Output: Number of surrogate pairs --> | |
<span id="outputPairs">0</span> surrogate pair(s).<br/> | |
</body> | |
</html> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment