Skip to content

Instantly share code, notes, and snippets.

@thesnarky1
Created April 6, 2014 22:09
Show Gist options
  • Save thesnarky1/10012004 to your computer and use it in GitHub Desktop.
Save thesnarky1/10012004 to your computer and use it in GitHub Desktop.
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>JS Bin</title>
</head>
<body>
<!-- Empty page shell: the body contains no markup and the page references no script element. -->
</body>
</html>
// Column indexes into each arabicTranslations entry.
// NICE_FORM is the letter's ordinary (non-presentation) code point, i.e. the
// same number that is used as the entry's key.
var NICE_FORM = 0;
var INITIAL_FORM = 1;
var MEDIAL_FORM = 2;
var FINAL_FORM = 3;
var ISOLATED_FORM = 4;

// Maps an Arabic letter's code point (decimal) to
// [nominal, initial, medial, final, isolated] code points, where the last four
// come from the Arabic Presentation Forms-B block (U+FE70-U+FEFF).
// Letters that never join to the following letter (see nonConnectors) have only
// two glyphs, so their initial slot repeats the isolated form and their medial
// slot repeats the final form.
// Letters without an entry here (for example 1577, 1571 or 1609) are not
// reshaped by getRealCharCodes.
var arabicTranslations = {};
//Alif
arabicTranslations[1575] = [1575, 65165, 65166, 65166, 65165];
//Ba
arabicTranslations[1576] = [1576, 65169, 65170, 65168, 65167];
//Ta
arabicTranslations[1578] = [1578, 65175, 65176, 65174, 65173];
//Tha
arabicTranslations[1579] = [1579, 65179, 65180, 65178, 65177];
//Jim
arabicTranslations[1580] = [1580, 65183, 65184, 65182, 65181];
//Ha
arabicTranslations[1581] = [1581, 65187, 65188, 65186, 65185];
//Kha
arabicTranslations[1582] = [1582, 65191, 65192, 65190, 65189];
//Dal
arabicTranslations[1583] = [1583, 65193, 65194, 65194, 65193];
//Thal
arabicTranslations[1584] = [1584, 65195, 65196, 65196, 65195];
//Ra (isolated form is 65197 / U+FEAD; 65195 is Thal's isolated glyph)
arabicTranslations[1585] = [1585, 65197, 65198, 65198, 65197];
//Zain
arabicTranslations[1586] = [1586, 65199, 65200, 65200, 65199];
//Seen
arabicTranslations[1587] = [1587, 65203, 65204, 65202, 65201];
//Sheen
arabicTranslations[1588] = [1588, 65207, 65208, 65206, 65205];
//Sod
arabicTranslations[1589] = [1589, 65211, 65212, 65210, 65209];
//Dod
arabicTranslations[1590] = [1590, 65215, 65216, 65214, 65213];
//Tah
arabicTranslations[1591] = [1591, 65219, 65220, 65218, 65217];
//Thah
arabicTranslations[1592] = [1592, 65223, 65224, 65222, 65221];
//Ayn (final form is 65226 / U+FECA; 65224 is Thah's medial glyph)
arabicTranslations[1593] = [1593, 65227, 65228, 65226, 65225];
//Ghayn
arabicTranslations[1594] = [1594, 65231, 65232, 65230, 65229];
//Fah
arabicTranslations[1601] = [1601, 65235, 65236, 65234, 65233];
//Qaf
arabicTranslations[1602] = [1602, 65239, 65240, 65238, 65237];
//Kaf
arabicTranslations[1603] = [1603, 65243, 65244, 65242, 65241];
//Lam
arabicTranslations[1604] = [1604, 65247, 65248, 65246, 65245];
//Mim
arabicTranslations[1605] = [1605, 65251, 65252, 65250, 65249];
//Nun
arabicTranslations[1606] = [1606, 65255, 65256, 65254, 65253];
//Heh
arabicTranslations[1607] = [1607, 65259, 65260, 65258, 65257];
//Waw
arabicTranslations[1608] = [1608, 65261, 65262, 65262, 65261];
//Ya
arabicTranslations[1610] = [1610, 65267, 65268, 65266, 65265];

// Letters that do not join to the letter after them (Alif, Dal, Thal, Ra,
// Zain, Waw): whatever follows one of these starts a new joined run.
var nonConnectors = [1575, 1583, 1584, 1585, 1586, 1608];
/**
 * Replaces each Arabic letter in a string with the presentation-form code
 * point (isolated, initial, medial or final) that matches its position.
 *
 * Relies on the file-level arabicTranslations table, the nonConnectors list
 * and the *_FORM index constants.
 *
 * A letter joins to the previous character when that character has a table
 * entry and is not a non-connector; it joins to the next character when that
 * character has a table entry. Characters without a table entry (spaces,
 * punctuation, Latin text, Arabic letters missing from the table) are copied
 * through unchanged and break joining on both sides.
 *
 * @param {string} str - Text to reshape.
 * @returns {string} The reshaped text; `str` itself when it contains nothing
 *     in the U+0600-U+06FF range.
 */
var getRealCharCodes = function(str) {
    // Fast path: nothing to reshape. This also covers the empty string.
    if (!/[\u0600-\u06FF]/.test(str)) { return str; }

    // True when the character at `index` exists and has presentation forms.
    var hasForms = function(index) {
        return index >= 0 && index < str.length &&
            arabicTranslations[str.charCodeAt(index)] !== undefined;
    };

    var toReturn = "";
    for (var x = 0; x < str.length; x++) {
        var forms = arabicTranslations[str.charCodeAt(x)];

        // We have no way to translate this character; keep it as it is.
        if (forms === undefined) {
            toReturn += str.charAt(x);
            continue;
        }

        // A non-connector before us never joins forward, so we start a new run.
        var joinsPrevious = hasForms(x - 1) &&
            nonConnectors.indexOf(str.charCodeAt(x - 1)) === -1;
        var joinsNext = hasForms(x + 1);

        var form;
        if (joinsPrevious) {
            form = joinsNext ? MEDIAL_FORM : FINAL_FORM;
        } else {
            // Single-letter strings land here too and get the isolated form.
            form = joinsNext ? INITIAL_FORM : ISOLATED_FORM;
        }
        toReturn += String.fromCharCode(forms[form]);
    }
    return toReturn;
};
// Demo: reshape a sample Arabic sentence, print the reshaped string, then
// print every UTF-16 code unit of the result followed by its character.
var test = "السنجاب حيوان من القوارض، يعيش غالبا على الأشجار، وله ذيل كثيف وكبير";
var newTest = getRealCharCodes(test);
console.log(newTest);
for (var x = 0; x < newTest.length; x++) {
    console.log(newTest.charCodeAt(x));
    console.log(newTest.charAt(x));
}
@micwallace
Copy link

For someone who doesn't understand Arabic this is an absolute goldmine. I'm currently trying to convert UCS-2 into CP864 for printing but first need to get the correct presentation form charcode of each character.

Thanks very much for sharing this sample.

If it is okay I might go ahead and submit a pull request to:
https://github.com/ahmads/arabicString
And see if the developer is interested in implementing this function.

@kergalym
Copy link

Hi, can I use your javascript code freely?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment