Created
April 6, 2014 22:09
-
-
Save thesnarky1/10012004 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>JS Bin</title>
</head>
<body>
</body>
</html>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Column indexes into each row of arabicTranslations.
// NICE_FORM is the letter's own code point (every row starts with its key).
var NICE_FORM = 0;
var INITIAL_FORM = 1;
var MEDIAL_FORM = 2;
var FINAL_FORM = 3;
var ISOLATED_FORM = 4;

// Maps an Arabic letter's code point to its positional glyph code points
// (values fall in the Arabic Presentation Forms-B range, U+FE8D..U+FEF4):
//   [nice, initial, medial, final, isolated]
// Letters listed in nonConnectors have only two distinct glyphs, so their
// initial slot repeats the isolated glyph and their medial slot repeats the
// final glyph.
var arabicTranslations = {};
//Alif
arabicTranslations[1575] = [1575, 65165, 65166, 65166, 65165];
//Ba
arabicTranslations[1576] = [1576, 65169, 65170, 65168, 65167];
//Ta
arabicTranslations[1578] = [1578, 65175, 65176, 65174, 65173];
//Tha
arabicTranslations[1579] = [1579, 65179, 65180, 65178, 65177];
//Jim
arabicTranslations[1580] = [1580, 65183, 65184, 65182, 65181];
//Ha
arabicTranslations[1581] = [1581, 65187, 65188, 65186, 65185];
//Kha
arabicTranslations[1582] = [1582, 65191, 65192, 65190, 65189];
//Dal
arabicTranslations[1583] = [1583, 65193, 65194, 65194, 65193];
//Thal
arabicTranslations[1584] = [1584, 65195, 65196, 65196, 65195];
//Ra (isolated glyph is 65197; 65195 is Thal's isolated glyph)
arabicTranslations[1585] = [1585, 65197, 65198, 65198, 65197];
//Zain
arabicTranslations[1586] = [1586, 65199, 65200, 65200, 65199];
//Seen
arabicTranslations[1587] = [1587, 65203, 65204, 65202, 65201];
//Sheen
arabicTranslations[1588] = [1588, 65207, 65208, 65206, 65205];
//Sod
arabicTranslations[1589] = [1589, 65211, 65212, 65210, 65209];
//Dod
arabicTranslations[1590] = [1590, 65215, 65216, 65214, 65213];
//Tah
arabicTranslations[1591] = [1591, 65219, 65220, 65218, 65217];
//Thah
arabicTranslations[1592] = [1592, 65223, 65224, 65222, 65221];
//Ayn (final glyph is 65226; 65224 is Thah's medial glyph)
arabicTranslations[1593] = [1593, 65227, 65228, 65226, 65225];
//Ghayn
arabicTranslations[1594] = [1594, 65231, 65232, 65230, 65229];
//Fah
arabicTranslations[1601] = [1601, 65235, 65236, 65234, 65233];
//Qaf
arabicTranslations[1602] = [1602, 65239, 65240, 65238, 65237];
//Kaf
arabicTranslations[1603] = [1603, 65243, 65244, 65242, 65241];
//Lam
arabicTranslations[1604] = [1604, 65247, 65248, 65246, 65245];
//Mim
arabicTranslations[1605] = [1605, 65251, 65252, 65250, 65249];
//Nun
arabicTranslations[1606] = [1606, 65255, 65256, 65254, 65253];
//Heh
arabicTranslations[1607] = [1607, 65259, 65260, 65258, 65257];
//Waw
arabicTranslations[1608] = [1608, 65261, 65262, 65262, 65261];
//Ya
arabicTranslations[1610] = [1610, 65267, 65268, 65266, 65265];

// Letters that never join to the letter after them (Alif, Dal, Thal, Ra,
// Zain, Waw): whatever follows one of these starts a new joined run.
var nonConnectors = [1575, 1583, 1584, 1585, 1586, 1608];
/**
 * Replaces every letter of `str` that has an entry in `arabicTranslations`
 * with the glyph code point matching its position in a joined run
 * (isolated, initial, medial or final). Characters without an entry are
 * copied through unchanged and end the current run.
 *
 * Relies on the file-level `arabicTranslations`, `nonConnectors` and the
 * `*_FORM` index constants.
 *
 * @param {string} str - Text to convert.
 * @returns {string} A string of the same length with known letters swapped
 *     for their positional glyphs; `str` itself when it is empty or has no
 *     character in U+0600..U+06FF.
 */
var getRealCharCodes = function(str) {
    //Nothing to convert in an empty string. A single letter is NOT skipped:
    //on its own it takes the isolated form, same as it would next to a space.
    if (str.length === 0) { console.log("Empty string"); return str; }
    //No arabic in here to change, let's be quick about it
    if (!/[\u0600-\u06FF]/.test(str)) { console.log("No arabic here"); return str; }

    var toReturn = "";
    //True when the next known letter starts a new joined run
    var initial = true;

    for (var x = 0; x < str.length; x++) {
        var tmpCharCode = str.charCodeAt(x);
        var forms = arabicTranslations[tmpCharCode];

        //We have no way to translate. `initial` is already true here: the
        //previous known letter saw this character in its look-ahead and
        //closed its run.
        if (forms === undefined) {
            var tmpChar = str.charAt(x);
            toReturn += tmpChar;
            console.log("Skipping unknown character: " + tmpChar + "-" + tmpCharCode);
            continue;
        }

        //A letter ends its run when it is the last character, or when the
        //next character is not a letter we know how to deal with
        var endsRun = x === str.length - 1 ||
            arabicTranslations[str.charCodeAt(x + 1)] === undefined;

        var formIndex;
        if (initial && endsRun) {
            formIndex = ISOLATED_FORM;
        } else if (initial) {
            formIndex = INITIAL_FORM;
        } else if (endsRun) {
            formIndex = FINAL_FORM;
        } else {
            formIndex = MEDIAL_FORM;
        }
        toReturn += String.fromCharCode(forms[formIndex]);

        //The next letter is initial if this one closed the run, or if this
        //is a non-connector (it never joins to what follows)
        initial = endsRun || nonConnectors.indexOf(tmpCharCode) > -1;
    }
    return toReturn;
};
// Demo: convert a sample sentence, print the converted string, then print
// each resulting UTF-16 unit as its numeric code followed by the character.
var test = "السنجاب حيوان من القوارض، يعيش غالبا على الأشجار، وله ذيل كثيف وكبير";
var newTest = getRealCharCodes(test);

console.log(newTest);
newTest.split("").forEach(function(unit) {
    console.log(unit.charCodeAt(0));
    console.log(unit);
});
For someone who doesn't understand Arabic this is an absolute goldmine. I'm currently trying to convert UCS-2 into CP864 for printing but first need to get the correct presentation form charcode of each character.
Thanks very much for sharing this sample.
If it is okay I might go ahead and submit a pull request to:
https://github.com/ahmads/arabicString
And see if the developer is interested in implementing this function.
Hi, can I use your javascript code freely?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Messing around with properly pulling an Arabic letter out to print one at a time in the correct format (browsers whitewash all that usually).